diff options
Diffstat (limited to 'kernel')
229 files changed, 27343 insertions, 15266 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 1e19722c64c3..cbbf79d718cf 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -244,7 +244,7 @@ static int acct_on(const char __user *name) if (!S_ISREG(file_inode(file)->i_mode)) return -EACCES; - /* Exclude kernel kernel internal filesystems. */ + /* Exclude kernel internal filesystems. */ if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) return -EINVAL; diff --git a/kernel/audit.c b/kernel/audit.c index 5a0216056524..e1d489bc2dff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -355,8 +355,8 @@ void audit_panic(const char *message) static inline int audit_rate_check(void) { - static unsigned long last_check = 0; - static int messages = 0; + static unsigned long last_check; + static int messages; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; @@ -391,7 +391,7 @@ static inline int audit_rate_check(void) */ void audit_log_lost(const char *message) { - static unsigned long last_msg = 0; + static unsigned long last_msg; static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned long now; @@ -1295,6 +1295,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); + if (s.mask & ~AUDIT_STATUS_ALL) + return -EINVAL; if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) diff --git a/kernel/audit.h b/kernel/audit.h index 7c401729e21b..ac81fa02bcd7 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -76,7 +76,7 @@ struct audit_names { int name_len; /* number of chars to log */ bool hidden; /* don't log this record */ - unsigned long ino; + u64 ino; dev_t dev; umode_t mode; kuid_t uid; @@ -225,9 +225,9 @@ extern int auditd_test_task(struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; -static inline int audit_hash_ino(u32 ino) +static inline 
int audit_hash_ino(u64 ino) { - return (ino & (AUDIT_INODE_BUCKETS-1)); + return ((u32)ino & (AUDIT_INODE_BUCKETS-1)); } /* Indicates that audit should log the full pathname. */ @@ -277,16 +277,15 @@ extern int audit_to_watch(struct audit_krule *krule, char *path, int len, extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, - dev_t dev); +extern int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev); extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); extern char *audit_mark_path(struct audit_fsnotify_mark *mark); extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); extern void audit_remove_mark_rule(struct audit_krule *krule); -extern int audit_mark_compare(struct audit_fsnotify_mark *mark, - unsigned long ino, dev_t dev); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino, + dev_t dev); extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index a4401f651060..711454f9f724 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -25,7 +25,7 @@ */ struct audit_fsnotify_mark { dev_t dev; /* associated superblock device */ - unsigned long ino; /* associated inode number */ + u64 ino; /* associated inode number */ char *path; /* insertion path */ struct fsnotify_mark mark; /* fsnotify mark on the inode */ struct audit_krule *rule; @@ -57,7 +57,7 @@ char *audit_mark_path(struct audit_fsnotify_mark *mark) return mark->path; } -int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev) +int audit_mark_compare(struct 
audit_fsnotify_mark *mark, u64 ino, dev_t dev) { if (mark->ino == AUDIT_INO_UNSET) return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 096faac2435c..33577f0f54ef 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -37,7 +37,7 @@ struct audit_watch { refcount_t count; /* reference count */ dev_t dev; /* associated superblock device */ char *path; /* insertion path */ - unsigned long ino; /* associated inode number */ + u64 ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ struct list_head rules; /* anchor for krule->rlist */ @@ -125,7 +125,7 @@ char *audit_watch_path(struct audit_watch *watch) return watch->path; } -int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) +int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev) { return (watch->ino != AUDIT_INO_UNSET) && (watch->ino == ino) && @@ -244,7 +244,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc /* Update inode info in audit rules based on filesystem event. 
*/ static void audit_update_watch(struct audit_parent *parent, const struct qstr *dname, dev_t dev, - unsigned long ino, unsigned invalidating) + u64 ino, unsigned invalidating) { struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; @@ -285,7 +285,7 @@ static void audit_update_watch(struct audit_parent *parent, list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); } else { - int h = audit_hash_ino((u32)ino); + int h = audit_hash_ino(ino); /* * nentry->rule.watch == oentry->rule.watch so @@ -439,7 +439,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) audit_add_to_parent(krule, parent); - h = audit_hash_ino((u32)watch->ino); + h = audit_hash_ino(watch->ino); *list = &audit_inode_hash[h]; error: path_put(&parent_path); @@ -527,7 +527,7 @@ int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) { struct file *exe_file; - unsigned long ino; + u64 ino; dev_t dev; /* only do exe filtering if we are recording @current events/records */ diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6e3abbf08e3d..093425123f6c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -303,8 +303,7 @@ exit_err: return ERR_PTR(err); } -static u32 audit_ops[] = -{ +static u32 audit_ops[] = { [Audit_equal] = AUDIT_EQUAL, [Audit_not_equal] = AUDIT_NOT_EQUAL, [Audit_bitmask] = AUDIT_BIT_MASK, diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f6af6a8f68c4..ab54fccba215 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -886,7 +886,7 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { - int h = audit_hash_ino((u32)n->ino); + int h = audit_hash_ino(n->ino); struct list_head *list = &audit_inode_hash[h]; return __audit_filter_op(tsk, ctx, list, n, ctx->major); @@ -1534,7 +1534,7 @@ static void audit_log_name(struct audit_context *context, 
struct audit_names *n, audit_log_format(ab, " name=(null)"); if (n->ino != AUDIT_INO_UNSET) - audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", + audit_log_format(ab, " inode=%llu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", n->ino, MAJOR(n->dev), MINOR(n->dev), diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 79cf22860a99..399007b67a92 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,11 +6,12 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o +obj-$(CONFIG_BPF_SYSCALL) += fixups.o cfg.o states.o backtrack.o check_btf.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 144f30e740e8..802656c6fd3c 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key, return -EOPNOTSUPP; } -static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { return 0; @@ -341,6 +341,16 @@ static void arena_vm_open(struct vm_area_struct *vma) refcount_inc(&vml->mmap_count); } +static 
int arena_vm_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + return -EINVAL; +} + +static int arena_vm_mremap(struct vm_area_struct *vma) +{ + return -EINVAL; +} + static void arena_vm_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; @@ -417,6 +427,8 @@ out_unlock_sigsegv: static const struct vm_operations_struct arena_vm_ops = { .open = arena_vm_open, + .may_split = arena_vm_may_split, + .mremap = arena_vm_mremap, .close = arena_vm_close, .fault = arena_vm_fault, }; @@ -486,10 +498,11 @@ static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) arena->user_vm_end = vma->vm_end; /* * bpf_map_mmap() checks that it's being mmaped as VM_SHARED and - * clears VM_MAYEXEC. Set VM_DONTEXPAND as well to avoid - * potential change of user_vm_start. + * clears VM_MAYEXEC. Set VM_DONTEXPAND to avoid potential change + * of user_vm_start. Set VM_DONTCOPY to prevent arena VMA from + * being copied into the child process on fork. */ - vm_flags_set(vma, VM_DONTEXPAND); + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY); vma->vm_ops = &arena_vm_ops; return 0; } @@ -549,6 +562,10 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt u32 uaddr32; int ret, i; + if (node_id != NUMA_NO_NODE && + ((unsigned int)node_id >= nr_node_ids || !node_online(node_id))) + return 0; + if (page_cnt > page_cnt_max) return 0; @@ -656,8 +673,7 @@ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) guard(mutex)(&arena->lock); /* iterate link list under lock */ list_for_each_entry(vml, &arena->vma_list, head) - zap_page_range_single(vml->vma, uaddr, - PAGE_SIZE * page_cnt, NULL); + zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 26763df6134a..5e25e0353509 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ 
-548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, +static int array_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) @@ -1015,8 +1015,10 @@ static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - for (i = 0; i < array->map.max_entries; i++) + for (i = 0; i < array->map.max_entries; i++) { __fd_array_map_delete_elem(map, &i, need_defer); + cond_resched(); + } } static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c new file mode 100644 index 000000000000..854731dc93fe --- /dev/null +++ b/kernel/bpf/backtrack.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/filter.h> +#include <linux/bitmap.h> + +#define verbose(env, fmt, args...) 
bpf_verifier_log_write(env, fmt, ##args) + +/* for any branch, call, exit record the history of jmps in the given state */ +int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs) +{ + u32 cnt = cur->jmp_history_cnt; + struct bpf_jmp_history_entry *p; + size_t alloc_size; + + /* combine instruction flags if we already recorded this instruction */ + if (env->cur_hist_ent) { + /* atomic instructions push insn_flags twice, for READ and + * WRITE sides, but they should agree on stack slot + */ + verifier_bug_if((env->cur_hist_ent->flags & insn_flags) && + (env->cur_hist_ent->flags & insn_flags) != insn_flags, + env, "insn history: insn_idx %d cur flags %x new flags %x", + env->insn_idx, env->cur_hist_ent->flags, insn_flags); + env->cur_hist_ent->flags |= insn_flags; + verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, + "insn history: insn_idx %d linked_regs: %#llx", + env->insn_idx, env->cur_hist_ent->linked_regs); + env->cur_hist_ent->linked_regs = linked_regs; + return 0; + } + + cnt++; + alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); + p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT); + if (!p) + return -ENOMEM; + cur->jmp_history = p; + + p = &cur->jmp_history[cnt - 1]; + p->idx = env->insn_idx; + p->prev_idx = env->prev_insn_idx; + p->flags = insn_flags; + p->linked_regs = linked_regs; + cur->jmp_history_cnt = cnt; + env->cur_hist_ent = p; + + return 0; +} + +static bool is_atomic_load_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ; +} + +static bool is_atomic_fetch_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + (insn->imm & BPF_FETCH); +} + +static int insn_stack_access_spi(int insn_flags) +{ + return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; +} + +static int 
insn_stack_access_frameno(int insn_flags) +{ + return insn_flags & INSN_F_FRAMENO_MASK; +} + +/* Backtrack one insn at a time. If idx is not at the top of recorded + * history then previous instruction came from straight line execution. + * Return -ENOENT if we exhausted all instructions within given state. + * + * It's legal to have a bit of a looping with the same starting and ending + * insn index within the same state, e.g.: 3->4->5->3, so just because current + * instruction index is the same as state's first_idx doesn't mean we are + * done. If there is still some jump history left, we should keep going. We + * need to take into account that we might have a jump history between given + * state's parent and itself, due to checkpointing. In this case, we'll have + * history entry recording a jump from last instruction of parent state and + * first instruction of given state. + */ +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) +{ + u32 cnt = *history; + + if (i == st->first_insn_idx) { + if (cnt == 0) + return -ENOENT; + if (cnt == 1 && st->jmp_history[0].idx == i) + return -ENOENT; + } + + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; + } else { + i--; + } + return i; +} + +static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, + u32 hist_end, int insn_idx) +{ + if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) + return &st->jmp_history[hist_end - 1]; + return NULL; +} + +static inline void bt_init(struct backtrack_state *bt, u32 frame) +{ + bt->frame = frame; +} + +static inline void bt_reset(struct backtrack_state *bt) +{ + struct bpf_verifier_env *env = bt->env; + + memset(bt, 0, sizeof(*bt)); + bt->env = env; +} + +static inline u32 bt_empty(struct backtrack_state *bt) +{ + u64 mask = 0; + int i; + + for (i = 0; i <= bt->frame; i++) + mask |= bt->reg_masks[i] | bt->stack_masks[i]; + + return mask == 0; +} + +static 
inline int bt_subprog_enter(struct backtrack_state *bt) +{ + if (bt->frame == MAX_CALL_FRAMES - 1) { + verifier_bug(bt->env, "subprog enter from frame %d", bt->frame); + return -EFAULT; + } + bt->frame++; + return 0; +} + +static inline int bt_subprog_exit(struct backtrack_state *bt) +{ + if (bt->frame == 0) { + verifier_bug(bt->env, "subprog exit from frame 0"); + return -EFAULT; + } + bt->frame--; + return 0; +} + +static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) +{ + bt->reg_masks[frame] &= ~(1 << reg); +} + +static inline void bt_set_reg(struct backtrack_state *bt, u32 reg) +{ + bpf_bt_set_frame_reg(bt, bt->frame, reg); +} + +static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg) +{ + bt_clear_frame_reg(bt, bt->frame, reg); +} + +static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_masks[frame] &= ~(1ull << slot); +} + +static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame) +{ + return bt->reg_masks[frame]; +} + +static inline u32 bt_reg_mask(struct backtrack_state *bt) +{ + return bt->reg_masks[bt->frame]; +} + +static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame) +{ + return bt->stack_masks[frame]; +} + +static inline u64 bt_stack_mask(struct backtrack_state *bt) +{ + return bt->stack_masks[bt->frame]; +} + +static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) +{ + return bt->reg_masks[bt->frame] & (1 << reg); +} + + +/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ +static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) +{ + DECLARE_BITMAP(mask, 64); + bool first = true; + int i, n; + + buf[0] = '\0'; + + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + n = snprintf(buf, buf_sz, "%sr%d", first ? 
"" : ",", i); + first = false; + buf += n; + buf_sz -= n; + if (buf_sz < 0) + break; + } +} +/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) +{ + DECLARE_BITMAP(mask, 64); + bool first = true; + int i, n; + + buf[0] = '\0'; + + bitmap_from_u64(mask, stack_mask); + for_each_set_bit(i, mask, 64) { + n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8); + first = false; + buf += n; + buf_sz -= n; + if (buf_sz < 0) + break; + } +} + + +/* For given verifier state backtrack_insn() is called from the last insn to + * the first insn. Its purpose is to compute a bitmask of registers and + * stack slots that needs precision in the parent verifier state. + * + * @idx is an index of the instruction we are currently processing; + * @subseq_idx is an index of the subsequent instruction that: + * - *would be* executed next, if jump history is viewed in forward order; + * - *was* processed previously during backtracking. 
+ */ +static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, + struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) +{ + struct bpf_insn *insn = env->prog->insnsi + idx; + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u32 dreg = insn->dst_reg; + u32 sreg = insn->src_reg; + u32 spi, i, fr; + + if (insn->code == 0) + return 0; + if (env->log.level & BPF_LOG_LEVEL2) { + fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); + verbose(env, "mark_precise: frame%d: regs=%s ", + bt->frame, env->tmp_str_buf); + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); + verbose(env, "stack=%s before ", env->tmp_str_buf); + verbose(env, "%d: ", idx); + bpf_verbose_insn(env, insn); + } + + /* If there is a history record that some registers gained range at this insn, + * propagate precision marks to those registers, so that bt_is_reg_set() + * accounts for these registers. + */ + bpf_bt_sync_linked_regs(bt, hist); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (!bt_is_reg_set(bt, dreg)) + return 0; + if (opcode == BPF_END || opcode == BPF_NEG) { + /* sreg is reserved and unused + * dreg still need precision before this insn + */ + return 0; + } else if (opcode == BPF_MOV) { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg = sreg or dreg = (s8, s16, s32)sreg + * dreg needs precision after this insn + * sreg needs precision before this insn + */ + bt_clear_reg(bt, dreg); + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); + } else { + /* dreg = K + * dreg needs precision after this insn. + * Corresponding register is already marked + * as precise=true in this verifier state. 
+ * No further markings in parent are necessary + */ + bt_clear_reg(bt, dreg); + } + } else { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg += sreg + * both dreg and sreg need precision + * before this insn + */ + if (sreg != BPF_REG_FP) + bt_set_reg(bt, sreg); + } /* else dreg += K + * dreg still needs precision before this insn + */ + } + } else if (class == BPF_LDX || + is_atomic_load_insn(insn) || + is_atomic_fetch_insn(insn)) { + u32 load_reg = dreg; + + /* + * Atomic fetch operation writes the old value into + * a register (sreg or r0) and if it was tracked for + * precision, propagate to the stack slot like we do + * in regular ldx. + */ + if (is_atomic_fetch_insn(insn)) + load_reg = insn->imm == BPF_CMPXCHG ? + BPF_REG_0 : sreg; + + if (!bt_is_reg_set(bt, load_reg)) + return 0; + bt_clear_reg(bt, load_reg); + + /* scalars can only be spilled into stack w/o losing precision. + * Load from any other memory can be zero extended. + * The desire to keep that precision is already indicated + * by 'precise' mark in corresponding register of this state. + * No further tracking necessary. + */ + if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) + return 0; + /* dreg = *(u64 *)[fp - off] was a fill from the stack. + * that [fp - off] slot contains scalar that needs to be + * tracked with precision + */ + spi = insn_stack_access_spi(hist->flags); + fr = insn_stack_access_frameno(hist->flags); + bpf_bt_set_frame_slot(bt, fr, spi); + } else if (class == BPF_STX || class == BPF_ST) { + if (bt_is_reg_set(bt, dreg)) + /* stx & st shouldn't be using _scalar_ dst_reg + * to access memory. It means backtracking + * encountered a case of pointer subtraction. 
+ */ + return -ENOTSUPP; + /* scalars can only be spilled into stack */ + if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) + return 0; + spi = insn_stack_access_spi(hist->flags); + fr = insn_stack_access_frameno(hist->flags); + if (!bt_is_frame_slot_set(bt, fr, spi)) + return 0; + bt_clear_frame_slot(bt, fr, spi); + if (class == BPF_STX) + bt_set_reg(bt, sreg); + } else if (class == BPF_JMP || class == BPF_JMP32) { + if (bpf_pseudo_call(insn)) { + int subprog_insn_idx, subprog; + + subprog_insn_idx = idx + insn->imm + 1; + subprog = bpf_find_subprog(env, subprog_insn_idx); + if (subprog < 0) + return -EFAULT; + + if (bpf_subprog_is_global(env, subprog)) { + /* check that jump history doesn't have any + * extra instructions from subprog; the next + * instruction after call to global subprog + * should be literally next instruction in + * caller program + */ + verifier_bug_if(idx + 1 != subseq_idx, env, + "extra insn from subprog"); + /* r1-r5 are invalidated after subprog call, + * so for global func call it shouldn't be set + * anymore + */ + if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { + verifier_bug(env, "global subprog unexpected regs %x", + bt_reg_mask(bt)); + return -EFAULT; + } + /* global subprog always sets R0 */ + bt_clear_reg(bt, BPF_REG_0); + return 0; + } else { + /* static subprog call instruction, which + * means that we are exiting current subprog, + * so only r1-r5 could be still requested as + * precise, r0 and r6-r10 or any stack slot in + * the current frame should be zero by now + */ + if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { + verifier_bug(env, "static subprog unexpected regs %x", + bt_reg_mask(bt)); + return -EFAULT; + } + /* we are now tracking register spills correctly, + * so any instance of leftover slots is a bug + */ + if (bt_stack_mask(bt) != 0) { + verifier_bug(env, + "static subprog leftover stack slots %llx", + bt_stack_mask(bt)); + return -EFAULT; + } + /* propagate r1-r5 to the caller */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) 
{ + if (bt_is_reg_set(bt, i)) { + bt_clear_reg(bt, i); + bpf_bt_set_frame_reg(bt, bt->frame - 1, i); + } + } + if (bt_subprog_exit(bt)) + return -EFAULT; + return 0; + } + } else if (bpf_is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { + /* exit from callback subprog to callback-calling helper or + * kfunc call. Use idx/subseq_idx check to discern it from + * straight line code backtracking. + * Unlike the subprog call handling above, we shouldn't + * propagate precision of r1-r5 (if any requested), as they are + * not actually arguments passed directly to callback subprogs + */ + if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { + verifier_bug(env, "callback unexpected regs %x", + bt_reg_mask(bt)); + return -EFAULT; + } + if (bt_stack_mask(bt) != 0) { + verifier_bug(env, "callback leftover stack slots %llx", + bt_stack_mask(bt)); + return -EFAULT; + } + /* clear r1-r5 in callback subprog's mask */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + bt_clear_reg(bt, i); + if (bt_subprog_exit(bt)) + return -EFAULT; + return 0; + } else if (opcode == BPF_CALL) { + /* kfunc with imm==0 is invalid and fixup_kfunc_call will + * catch this error later. Make backtracking conservative + * with ENOTSUPP. + */ + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) + return -ENOTSUPP; + /* regular helper call sets R0 */ + bt_clear_reg(bt, BPF_REG_0); + if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { + /* if backtracking was looking for registers R1-R5 + * they should have been found already. + */ + verifier_bug(env, "backtracking call unexpected regs %x", + bt_reg_mask(bt)); + return -EFAULT; + } + if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call + && subseq_idx - idx != 1) { + if (bt_subprog_enter(bt)) + return -EFAULT; + } + } else if (opcode == BPF_EXIT) { + bool r0_precise; + + /* Backtracking to a nested function call, 'idx' is a part of + * the inner frame 'subseq_idx' is a part of the outer frame. 
+ * In case of a regular function call, instructions giving + * precision to registers R1-R5 should have been found already. + * In case of a callback, it is ok to have R1-R5 marked for + * backtracking, as these registers are set by the function + * invoking callback. + */ + if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx)) + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + bt_clear_reg(bt, i); + if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { + verifier_bug(env, "backtracking exit unexpected regs %x", + bt_reg_mask(bt)); + return -EFAULT; + } + + /* BPF_EXIT in subprog or callback always returns + * right after the call instruction, so by checking + * whether the instruction at subseq_idx-1 is subprog + * call or not we can distinguish actual exit from + * *subprog* from exit from *callback*. In the former + * case, we need to propagate r0 precision, if + * necessary. In the former we never do that. + */ + r0_precise = subseq_idx - 1 >= 0 && + bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) && + bt_is_reg_set(bt, BPF_REG_0); + + bt_clear_reg(bt, BPF_REG_0); + if (bt_subprog_enter(bt)) + return -EFAULT; + + if (r0_precise) + bt_set_reg(bt, BPF_REG_0); + /* r6-r9 and stack slots will stay set in caller frame + * bitmasks until we return back from callee(s) + */ + return 0; + } else if (BPF_SRC(insn->code) == BPF_X) { + if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg)) + return 0; + /* dreg <cond> sreg + * Both dreg and sreg need precision before + * this insn. If only sreg was marked precise + * before it would be equally necessary to + * propagate it to dreg. + */ + if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK)) + bt_set_reg(bt, sreg); + if (!hist || !(hist->flags & INSN_F_DST_REG_STACK)) + bt_set_reg(bt, dreg); + } else if (BPF_SRC(insn->code) == BPF_K) { + /* dreg <cond> K + * Only dreg still needs precision before + * this insn, so for the K-based conditional + * there is nothing new to be marked. 
+ */ + } + } else if (class == BPF_LD) { + if (!bt_is_reg_set(bt, dreg)) + return 0; + bt_clear_reg(bt, dreg); + /* It's ld_imm64 or ld_abs or ld_ind. + * For ld_imm64 no further tracking of precision + * into parent is necessary + */ + if (mode == BPF_IND || mode == BPF_ABS) + /* to be analyzed */ + return -ENOTSUPP; + } + /* Propagate precision marks to linked registers, to account for + * registers marked as precise in this function. + */ + bpf_bt_sync_linked_regs(bt, hist); + return 0; +} + +/* the scalar precision tracking algorithm: + * . at the start all registers have precise=false. + * . scalar ranges are tracked as normal through alu and jmp insns. + * . once precise value of the scalar register is used in: + * . ptr + scalar alu + * . if (scalar cond K|scalar) + * . helper_call(.., scalar, ...) where ARG_CONST is expected + * backtrack through the verifier states and mark all registers and + * stack slots with spilled constants that these scalar registers + * should be precise. + * . during state pruning two registers (or spilled stack slots) + * are equivalent if both are not precise. + * + * Note the verifier cannot simply walk register parentage chain, + * since many different registers and stack slots could have been + * used to compute single precise scalar. + * + * The approach of starting with precise=true for all registers and then + * backtrack to mark a register as not precise when the verifier detects + * that program doesn't care about specific value (e.g., when helper + * takes register as ARG_ANYTHING parameter) is not safe. + * + * It's ok to walk single parentage chain of the verifier states. + * It's possible that this backtracking will go all the way till 1st insn. + * All other branches will be explored for needing precision later. 
+ * + * The backtracking needs to deal with cases like: + * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) + * r9 -= r8 + * r5 = r9 + * if r5 > 0x79f goto pc+7 + * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) + * r5 += 1 + * ... + * call bpf_perf_event_output#25 + * where .arg5_type = ARG_CONST_SIZE_OR_ZERO + * + * and this case: + * r6 = 1 + * call foo // uses callee's r6 inside to compute r0 + * r0 += r6 + * if r0 == 0 goto + * + * to track above reg_mask/stack_mask needs to be independent for each frame. + * + * Also if parent's curframe > frame where backtracking started, + * the verifier need to mark registers in both frames, otherwise callees + * may incorrectly prune callers. This is similar to + * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") + * + * For now backtracking falls back into conservative marking. + */ +void bpf_mark_all_scalars_precise(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n", + st->curframe); + } + + /* big hammer: mark all scalars precise in this path. + * pop_stack may still get !precise scalars. + * We also skip current state and go straight to first parent state, + * because precision markings in current non-checkpointed state are + * not needed. See why in the comment in __mark_chain_precision below. 
+ */ + for (st = st->parent; st; st = st->parent) { + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE || reg->precise) + continue; + reg->precise = true; + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "force_precise: frame%d: forcing r%d to be precise\n", + i, j); + } + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!bpf_is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE || reg->precise) + continue; + reg->precise = true; + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n", + i, -(j + 1) * 8); + } + } + } + } +} + +/* + * bpf_mark_chain_precision() backtracks BPF program instruction sequence and + * chain of verifier states making sure that register *regno* (if regno >= 0) + * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked + * SCALARS, as well as any other registers and slots that contribute to + * a tracked state of given registers/stack slots, depending on specific BPF + * assembly instructions (see backtrack_insns() for exact instruction handling + * logic). This backtracking relies on recorded jmp_history and is able to + * traverse entire chain of parent states. This process ends only when all the + * necessary registers/slots and their transitive dependencies are marked as + * precise. + * + * One important and subtle aspect is that precise marks *do not matter* in + * the currently verified state (current state). It is important to understand + * why this is the case. + * + * First, note that current state is the state that is not yet "checkpointed", + * i.e., it is not yet put into env->explored_states, and it has no children + * states as well. 
It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced.
So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., its precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state.
If no child state requires + * precision for any given SCALAR register, it's safe to dictate that it can + * be imprecise. If any child state does require this register to be precise, + * we'll mark it precise later retroactively during precise markings + * propagation from child state to parent states. + * + * Skipping precise marking setting in current state is a mild version of + * relying on the above observation. But we can utilize this property even + * more aggressively by proactively forgetting any precise marking in the + * current state (which we inherited from the parent state), right before we + * checkpoint it and branch off into new child state. This is done by + * mark_all_scalars_imprecise() to hopefully get more permissive and generic + * finalized states which help in short circuiting more future states. + */ +int bpf_mark_chain_precision(struct bpf_verifier_env *env, + struct bpf_verifier_state *starting_state, + int regno, + bool *changed) +{ + struct bpf_verifier_state *st = starting_state; + struct backtrack_state *bt = &env->bt; + int first_idx = st->first_insn_idx; + int last_idx = starting_state->insn_idx; + int subseq_idx = -1; + struct bpf_func_state *func; + bool tmp, skip_first = true; + struct bpf_reg_state *reg; + int i, fr, err; + + if (!env->bpf_capable) + return 0; + + changed = changed ?: &tmp; + /* set frame number from which we are starting to backtrack */ + bt_init(bt, starting_state->curframe); + + /* Do sanity checks against current state of register and/or stack + * slot, but don't set precise flag in current state, as precision + * tracking in the current state is unnecessary. 
+ */ + func = st->frame[bt->frame]; + if (regno >= 0) { + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + verifier_bug(env, "backtracking misuse"); + return -EFAULT; + } + bt_set_reg(bt, regno); + } + + if (bt_empty(bt)) + return 0; + + for (;;) { + DECLARE_BITMAP(mask, 64); + u32 history = st->jmp_history_cnt; + struct bpf_jmp_history_entry *hist; + + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", + bt->frame, last_idx, first_idx, subseq_idx); + } + + if (last_idx < 0) { + /* we are at the entry into subprog, which + * is expected for global funcs, but only if + * requested precise registers are R1-R5 + * (which are global func's input arguments) + */ + if (st->curframe == 0 && + st->frame[0]->subprogno > 0 && + st->frame[0]->callsite == BPF_MAIN_FUNC && + bt_stack_mask(bt) == 0 && + (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) { + bitmap_from_u64(mask, bt_reg_mask(bt)); + for_each_set_bit(i, mask, 32) { + reg = &st->frame[0]->regs[i]; + bt_clear_reg(bt, i); + if (reg->type == SCALAR_VALUE) { + reg->precise = true; + *changed = true; + } + } + return 0; + } + + verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx", + st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); + return -EFAULT; + } + + for (i = last_idx;;) { + if (skip_first) { + err = 0; + skip_first = false; + } else { + hist = get_jmp_hist_entry(st, history, i); + err = backtrack_insn(env, i, subseq_idx, hist, bt); + } + if (err == -ENOTSUPP) { + bpf_mark_all_scalars_precise(env, starting_state); + bt_reset(bt); + return 0; + } else if (err) { + return err; + } + if (bt_empty(bt)) + /* Found assignment(s) into tracked register in this state. + * Since this state is already marked, just return. + * Nothing to be tracked further in the parent state. 
+ */ + return 0; + subseq_idx = i; + i = get_prev_insn_idx(st, i, &history); + if (i == -ENOENT) + break; + if (i >= env->prog->len) { + /* This can happen if backtracking reached insn 0 + * and there are still reg_mask or stack_mask + * to backtrack. + * It means the backtracking missed the spot where + * particular register was initialized with a constant. + */ + verifier_bug(env, "backtracking idx %d", i); + return -EFAULT; + } + } + st = st->parent; + if (!st) + break; + + for (fr = bt->frame; fr >= 0; fr--) { + func = st->frame[fr]; + bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (reg->type != SCALAR_VALUE) { + bt_clear_frame_reg(bt, fr, i); + continue; + } + if (reg->precise) { + bt_clear_frame_reg(bt, fr, i); + } else { + reg->precise = true; + *changed = true; + } + } + + bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); + for_each_set_bit(i, mask, 64) { + if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE, + env, "stack slot %d, total slots %d", + i, func->allocated_stack / BPF_REG_SIZE)) + return -EFAULT; + + if (!bpf_is_spilled_scalar_reg(&func->stack[i])) { + bt_clear_frame_slot(bt, fr, i); + continue; + } + reg = &func->stack[i].spilled_ptr; + if (reg->precise) { + bt_clear_frame_slot(bt, fr, i); + } else { + reg->precise = true; + *changed = true; + } + } + if (env->log.level & BPF_LOG_LEVEL2) { + fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bt_frame_reg_mask(bt, fr)); + verbose(env, "mark_precise: frame%d: parent state regs=%s ", + fr, env->tmp_str_buf); + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bt_frame_stack_mask(bt, fr)); + verbose(env, "stack=%s: ", env->tmp_str_buf); + print_verifier_state(env, st, fr, true); + } + } + + if (bt_empty(bt)) + return 0; + + subseq_idx = first_idx; + last_idx = st->last_insn_idx; + first_idx = st->first_insn_idx; + } + + /* if we still have requested precise regs or slots, we missed + * something (e.g., stack access 
through non-r10 register), so + * fallback to marking all precise + */ + if (!bt_empty(bt)) { + bpf_mark_all_scalars_precise(env, starting_state); + bt_reset(bt); + } + + return 0; +} diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 35e1ddca74d2..b73336c976b7 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key, return -EINVAL; } -static int bloom_map_check_btf(const struct bpf_map *map, +static int bloom_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index c2a2ead1f466..c76e9b0fabba 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -76,7 +76,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, return PTR_ERR(cgroup); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, map_flags, false, GFP_ATOMIC); + value, map_flags, false); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } @@ -114,7 +114,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &cgroup_cache, true); + return bpf_local_storage_map_alloc(attr, &cgroup_cache); } static void cgroup_storage_map_free(struct bpf_map *map) @@ -122,9 +122,8 @@ static void cgroup_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(map, &cgroup_cache); } -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, - void *, value, u64, flags, gfp_t, gfp_flags) +BPF_CALL_4(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, + void *, value, u64, flags) { struct bpf_local_storage_data *sdata; @@ -143,7 
+142,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, BPF_NOEXIST, false, gfp_flags); + value, BPF_NOEXIST, false); out: return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e86734609f3d..0da8d923e39d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -98,7 +98,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(file_inode(fd_file(f)), (struct bpf_local_storage_map *)map, - value, map_flags, false, GFP_ATOMIC); + value, map_flags, false); return PTR_ERR_OR_ZERO(sdata); } @@ -122,9 +122,8 @@ static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) return inode_storage_delete(file_inode(fd_file(f)), map); } -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, - void *, value, u64, flags, gfp_t, gfp_flags) +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags) { struct bpf_local_storage_data *sdata; @@ -150,7 +149,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, false, gfp_flags); + BPF_NOEXIST, false); return IS_ERR(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } @@ -179,7 +178,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &inode_cache, false); + return bpf_local_storage_map_alloc(attr, &inode_cache); } static void inode_storage_map_free(struct bpf_map *map) diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index c0286f25ca3c..a2f84afe6f7c 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } -static int insn_array_check_btf(const struct bpf_map *map, +static int insn_array_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b28f07d3a0db..6fc6a4b672b5 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -68,25 +68,19 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool swap_uptrs, gfp_t gfp_flags) + void *value, bool swap_uptrs) { struct bpf_local_storage_elem *selem; if (mem_charge(smap, owner, smap->elem_size)) return NULL; - if (smap->use_kmalloc_nolock) { - selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size, - __GFP_ZERO, NUMA_NO_NODE); - } else { - selem = bpf_map_kzalloc(&smap->map, smap->elem_size, - gfp_flags | __GFP_NOWARN); - } + selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size, + __GFP_ZERO, NUMA_NO_NODE); if (selem) { RCU_INIT_POINTER(SDATA(selem)->smap, smap); atomic_set(&selem->state, 0); - selem->use_kmalloc_nolock = smap->use_kmalloc_nolock; if (value) { /* No need to call check_and_init_map_value as memory is zero init */ @@ -102,46 +96,16 
@@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; } -/* rcu tasks trace callback for use_kmalloc_nolock == false */ -static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) +static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; - /* If RCU Tasks Trace grace period implies RCU grace period, do - * kfree(), else do kfree_rcu(). + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); - if (rcu_trace_implies_rcu_gp()) - kfree(local_storage); - else - kfree_rcu(local_storage, rcu); -} - -/* Handle use_kmalloc_nolock == false */ -static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, - bool vanilla_rcu) -{ - if (vanilla_rcu) - kfree_rcu(local_storage, rcu); - else - call_rcu_tasks_trace(&local_storage->rcu, - __bpf_local_storage_free_trace_rcu); -} - -static void bpf_local_storage_free_rcu(struct rcu_head *rcu) -{ - struct bpf_local_storage *local_storage; - - local_storage = container_of(rcu, struct bpf_local_storage, rcu); - kfree_nolock(local_storage); -} - -static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) -{ - if (rcu_trace_implies_rcu_gp()) - bpf_local_storage_free_rcu(rcu); - else - call_rcu(rcu, bpf_local_storage_free_rcu); + kfree(local_storage); } static void bpf_local_storage_free(struct bpf_local_storage *local_storage, @@ -150,13 +114,8 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage, if (!local_storage) return; - if (!local_storage->use_kmalloc_nolock) { - __bpf_local_storage_free(local_storage, reuse_now); - return; - } - if (reuse_now) { - call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); + kfree_rcu(local_storage, rcu); return; } @@ -164,29 +123,7 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage, bpf_local_storage_free_trace_rcu); } -/* rcu tasks 
trace callback for use_kmalloc_nolock == false */ -static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) -{ - struct bpf_local_storage_elem *selem; - - selem = container_of(rcu, struct bpf_local_storage_elem, rcu); - if (rcu_trace_implies_rcu_gp()) - kfree(selem); - else - kfree_rcu(selem, rcu); -} - -/* Handle use_kmalloc_nolock == false */ -static void __bpf_selem_free(struct bpf_local_storage_elem *selem, - bool vanilla_rcu) -{ - if (vanilla_rcu) - kfree_rcu(selem, rcu); - else - call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); -} - -static void bpf_selem_free_rcu(struct rcu_head *rcu) +static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; struct bpf_local_storage_map *smap; @@ -195,20 +132,13 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - if (smap) { - migrate_disable(); + if (smap) bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); - } - kfree_nolock(selem); -} - -static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) -{ - if (rcu_trace_implies_rcu_gp()) - bpf_selem_free_rcu(rcu); - else - call_rcu(rcu, bpf_selem_free_rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + kfree(selem); } void bpf_selem_free(struct bpf_local_storage_elem *selem, @@ -216,26 +146,12 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, { struct bpf_local_storage_map *smap; - smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); + smap = rcu_dereference_check(SDATA(selem)->smap, 1); - if (!selem->use_kmalloc_nolock) { - /* - * No uptr will be unpin even when reuse_now == false since uptr - * is only supported in task local storage, where - * smap->use_kmalloc_nolock == true. 
-	 */
+	if (reuse_now) {
 		if (smap)
 			bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-		__bpf_selem_free(selem, reuse_now);
-		return;
-	}
-
-	if (reuse_now) {
-		/*
-		 * While it is okay to call bpf_obj_free_fields() that unpins uptr when
-		 * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
-		 */
-		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+		kfree_rcu(selem, rcu);
 		return;
 	}
@@ -389,6 +305,9 @@ int bpf_selem_unlink(struct bpf_local_storage_elem *selem)
 	unsigned long flags;
 	int err;
 
+	if (in_nmi())
+		return -EOPNOTSUPP;
+
 	if (unlikely(!selem_linked_to_storage_lockless(selem)))
 		/* selem has already been unlinked from sk */
 		return 0;
@@ -490,6 +409,14 @@ static void bpf_selem_unlink_nofail(struct bpf_local_storage_elem *selem,
 		}
 		raw_res_spin_unlock_irqrestore(&local_storage->lock, flags);
 	}
+	/*
+	 * Highly unlikely scenario: memory leak
+	 *
+	 * When destroy() fails to acquire local_storage->lock and initializes
+	 * selem->local_storage to NULL before any racing map_free() sees the same
+	 * selem, no one will free the local storage.
+ */ + WARN_ON_ONCE(err && !in_map_free); if (!err || !in_map_free) RCU_INIT_POINTER(selem->local_storage, NULL); } @@ -548,8 +475,7 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata, int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem, - gfp_t gfp_flags) + struct bpf_local_storage_elem *first_selem) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; @@ -561,12 +487,8 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - if (smap->use_kmalloc_nolock) - storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage), - __GFP_ZERO, NUMA_NO_NODE); - else - storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), - gfp_flags | __GFP_NOWARN); + storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage), + __GFP_ZERO, NUMA_NO_NODE); if (!storage) { err = -ENOMEM; goto uncharge; @@ -576,7 +498,6 @@ int bpf_local_storage_alloc(void *owner, raw_res_spin_lock_init(&storage->lock); storage->owner = owner; storage->mem_charge = sizeof(*storage); - storage->use_kmalloc_nolock = smap->use_kmalloc_nolock; refcount_set(&storage->owner_refcnt, 1); bpf_selem_link_storage_nolock(storage, first_selem); @@ -624,7 +545,7 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags) + void *value, u64 map_flags, bool swap_uptrs) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; @@ -641,9 +562,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); - if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) - return ERR_PTR(-EINVAL); - local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if 
(!local_storage || hlist_empty(&local_storage->list)) { @@ -652,11 +570,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, swap_uptrs); if (!selem) return ERR_PTR(-ENOMEM); - err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); + err = bpf_local_storage_alloc(owner, smap, selem); if (err) { bpf_selem_free(selem, true); mem_uncharge(smap, owner, smap->elem_size); @@ -686,7 +604,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs); if (!alloc_selem) return ERR_PTR(-ENOMEM); @@ -797,7 +715,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -int bpf_local_storage_map_check_btf(const struct bpf_map *map, +int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) @@ -853,8 +771,7 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, - struct bpf_local_storage_cache *cache, - bool use_kmalloc_nolock) + struct bpf_local_storage_cache *cache) { struct bpf_local_storage_map *smap; unsigned int i; @@ -886,12 +803,6 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, smap->elem_size = offsetof(struct bpf_local_storage_elem, sdata.data[attr->value_size]); - /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non - * preemptible context. Thus, enforce all storages to use - * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled. - */ - smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? 
true : use_kmalloc_nolock; - smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; @@ -958,10 +869,9 @@ restart: */ synchronize_rcu(); - if (smap->use_kmalloc_nolock) { - rcu_barrier_tasks_trace(); - rcu_barrier(); - } + /* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */ + rcu_barrier_tasks_trace(); + rcu_barrier(); kvfree(smap->buckets); bpf_map_area_free(smap); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 0c4a0c8e6f70..c5c925f00202 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -359,8 +359,6 @@ BTF_ID(func, bpf_lsm_sb_umount) BTF_ID(func, bpf_lsm_settime) #ifdef CONFIG_SECURITY_NETWORK -BTF_ID(func, bpf_lsm_inet_conn_established) - BTF_ID(func, bpf_lsm_socket_accept) BTF_ID(func, bpf_lsm_socket_bind) BTF_ID(func, bpf_lsm_socket_connect) @@ -381,8 +379,9 @@ BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) -BTF_ID(func, bpf_lsm_task_to_inode) BTF_ID(func, bpf_lsm_userns_create) +BTF_ID(func, bpf_lsm_bdev_alloc_security) +BTF_ID(func, bpf_lsm_bdev_setintegrity) BTF_SET_END(sleepable_lsm_hooks) BTF_SET_START(untrusted_lsm_hooks) @@ -395,6 +394,8 @@ BTF_ID(func, bpf_lsm_sk_alloc_security) BTF_ID(func, bpf_lsm_sk_free_security) #endif /* CONFIG_SECURITY_NETWORK */ BTF_ID(func, bpf_lsm_task_free) +BTF_ID(func, bpf_lsm_bdev_alloc_security) +BTF_ID(func, bpf_lsm_bdev_free_security) BTF_SET_END(untrusted_lsm_hooks) bool bpf_lsm_is_sleepable_hook(u32 btf_id) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 05b366b821c3..521cb9d7e8c7 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -811,9 +811,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } - /* Poison pointer on error instead of return for backward compatibility */ - bpf_prog_assoc_struct_ops(prog, &st_map->map); - link = kzalloc_obj(*link, 
GFP_USER); if (!link) { bpf_prog_put(prog); @@ -824,6 +821,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); *plink++ = &link->link; + /* Poison pointer on error instead of return for backward compatibility */ + bpf_prog_assoc_struct_ops(prog, &st_map->map); + ksym = kzalloc_obj(*ksym, GFP_USER); if (!ksym) { err = -ENOMEM; @@ -906,6 +906,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, reset_unlock: bpf_struct_ops_map_free_ksyms(st_map); bpf_struct_ops_map_free_image(st_map); + bpf_struct_ops_map_dissoc_progs(st_map); bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 605506792b5b..4b342be29eac 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -118,7 +118,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, - true, GFP_ATOMIC); + true); err = PTR_ERR_OR_ZERO(sdata); out: @@ -165,9 +165,8 @@ out: return err; } -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags) { struct bpf_local_storage_data *sdata; @@ -184,7 +183,7 @@ BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, false, gfp_flags); + BPF_NOEXIST, false); return IS_ERR(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } @@ -212,7 +211,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &task_cache, true); + return bpf_local_storage_map_alloc(attr, &task_cache); } static void task_storage_map_free(struct bpf_map *map) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4872d2a6c42d..a62d78581207 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -270,6 +270,7 @@ struct btf { struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; struct btf_struct_metas *struct_meta_tab; struct btf_struct_ops_tab *struct_ops_tab; + struct btf_layout *layout; /* split BTF support */ struct btf *base_btf; @@ -1707,6 +1708,11 @@ static void btf_verifier_log_hdr(struct btf_verifier_env *env, __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); + if (hdr->hdr_len >= sizeof(struct btf_header) && + btf_data_size >= hdr->hdr_len) { + __btf_verifier_log(log, "layout_off: %u\n", hdr->layout_off); + __btf_verifier_log(log, "layout_len: %u\n", hdr->layout_len); + } __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); } @@ -1787,7 +1793,16 @@ static void btf_free_id(struct btf *btf) * of the _bh() version. */ spin_lock_irqsave(&btf_idr_lock, flags); - idr_remove(&btf_idr, btf->id); + if (btf->id) { + idr_remove(&btf_idr, btf->id); + /* + * Clear the id here to make this function idempotent, since it will get + * called a couple of times for module BTFs: on module unload, and then + * the final btf_put(). btf_alloc_id() starts IDs with 1, so we can use + * 0 as sentinel value. 
+ */ + WRITE_ONCE(btf->id, 0); + } spin_unlock_irqrestore(&btf_idr_lock, flags); } @@ -5517,7 +5532,8 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) start = btf->nohdr_data + hdr->str_off; end = start + hdr->str_len; - if (end != btf->data + btf->data_size) { + if (hdr->hdr_len < sizeof(struct btf_header) && + end != btf->data + btf->data_size) { btf_verifier_log(env, "String section is not at the end"); return -EINVAL; } @@ -5538,9 +5554,46 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) return 0; } +static int btf_parse_layout_sec(struct btf_verifier_env *env) +{ + const struct btf_header *hdr = &env->btf->hdr; + struct btf *btf = env->btf; + void *start, *end; + + if (hdr->hdr_len < sizeof(struct btf_header) || + hdr->layout_len == 0) + return 0; + + /* Layout section must align to 4 bytes */ + if (hdr->layout_off & (sizeof(u32) - 1)) { + btf_verifier_log(env, "Unaligned layout_off"); + return -EINVAL; + } + start = btf->nohdr_data + hdr->layout_off; + end = start + hdr->layout_len; + + if (hdr->layout_len < sizeof(struct btf_layout)) { + btf_verifier_log(env, "Layout section is too small"); + return -EINVAL; + } + if (hdr->layout_len % sizeof(struct btf_layout) != 0) { + btf_verifier_log(env, "layout_len is not multiple of %zu", + sizeof(struct btf_layout)); + return -EINVAL; + } + if (end > btf->data + btf->data_size) { + btf_verifier_log(env, "Layout section is too big"); + return -EINVAL; + } + btf->layout = start; + + return 0; +} + static const size_t btf_sec_info_offset[] = { offsetof(struct btf_header, type_off), offsetof(struct btf_header, str_off), + offsetof(struct btf_header, layout_off) }; static int btf_sec_info_cmp(const void *a, const void *b) @@ -5556,24 +5609,28 @@ static int btf_check_sec_info(struct btf_verifier_env *env, { struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; u32 total, expected_total, i; + u32 nr_secs = ARRAY_SIZE(btf_sec_info_offset); const struct btf_header *hdr; const struct btf *btf; 
btf = env->btf; hdr = &btf->hdr; + if (hdr->hdr_len < sizeof(struct btf_header) || hdr->layout_len == 0) + nr_secs--; + /* Populate the secs from hdr */ - for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) + for (i = 0; i < nr_secs; i++) secs[i] = *(struct btf_sec_info *)((void *)hdr + btf_sec_info_offset[i]); - sort(secs, ARRAY_SIZE(btf_sec_info_offset), + sort(secs, nr_secs, sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); /* Check for gaps and overlap among sections */ total = 0; expected_total = btf_data_size - hdr->hdr_len; - for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { + for (i = 0; i < nr_secs; i++) { if (expected_total < secs[i].off) { btf_verifier_log(env, "Invalid section offset"); return -EINVAL; @@ -5929,6 +5986,10 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat if (err) goto errout; + err = btf_parse_layout_sec(env); + if (err) + goto errout; + err = btf_parse_type_sec(env); if (err) goto errout; @@ -6508,13 +6569,6 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) return prog->aux->attach_btf; } -static bool is_void_or_int_ptr(struct btf *btf, const struct btf_type *t) -{ - /* skip modifiers */ - t = btf_type_skip_modifiers(btf, t->type, NULL); - return btf_type_is_void(t) || btf_type_is_int(t); -} - u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off) { @@ -6903,10 +6957,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } /* - * If it's a pointer to void, it's the same as scalar from the verifier - * safety POV. Either way, no futher pointer walking is allowed. + * If it's a single or multilevel pointer, except a pointer + * to a structure, it's the same as scalar from the verifier + * safety POV. Multilevel pointers to structures are treated as + * scalars. The verifier lacks the context to infer the size of + * their target memory regions. Either way, no further pointer + * walking is allowed. 
*/ - if (is_void_or_int_ptr(btf, t)) + if (!btf_type_is_struct_ptr(btf, t)) return true; /* this is a pointer to another type */ @@ -7836,15 +7894,16 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } - /* check that function returns int, exception cb also requires this */ + /* check that function is void or returns int, exception cb also requires this */ t = btf_type_by_id(btf, t->type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (!btf_type_is_int(t) && !btf_is_any_enum(t)) { + if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) { if (!is_global) return -EINVAL; bpf_log(log, - "Global function %s() doesn't return scalar. Only those are supported.\n", + "Global function %s() return value not void or scalar. " + "Only those are supported.\n", tname); return -EINVAL; } @@ -8115,7 +8174,7 @@ static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) { const struct btf *btf = filp->private_data; - seq_printf(m, "btf_id:\t%u\n", btf->id); + seq_printf(m, "btf_id:\t%u\n", READ_ONCE(btf->id)); } #endif @@ -8197,7 +8256,7 @@ int btf_get_info_by_fd(const struct btf *btf, if (copy_from_user(&info, uinfo, info_copy)) return -EFAULT; - info.id = btf->id; + info.id = READ_ONCE(btf->id); ubtf = u64_to_user_ptr(info.btf); btf_copy = min_t(u32, btf->data_size, info.btf_size); if (copy_to_user(ubtf, btf->data, btf_copy)) @@ -8260,7 +8319,7 @@ int btf_get_fd_by_id(u32 id) u32 btf_obj_id(const struct btf *btf) { - return btf->id; + return READ_ONCE(btf->id); } bool btf_is_kernel(const struct btf *btf) @@ -8382,6 +8441,13 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, if (btf_mod->module != module) continue; + /* + * For modules, we do the freeing of BTF IDR as soon as + * module goes away to disable BTF discovery, since the + * btf_try_get_module() on such BTFs will fail. 
This may + * be called again on btf_put(), but it's ok to do so. + */ + btf_free_id(btf_mod->btf); list_del(&btf_mod->list); if (btf_mod->sysfs_attr) sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); @@ -9003,7 +9069,7 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc if (!t || !btf_type_is_ptr(t)) return -EINVAL; - if (IS_ENABLED(CONFIG_CFI_CLANG)) { + if (IS_ENABLED(CONFIG_CFI)) { /* Ensure the destructor kfunc type matches btf_dtor_kfunc_t */ t = btf_type_by_id(btf, t->type); if (!btf_type_is_void(t)) diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c new file mode 100644 index 000000000000..998f42a8189a --- /dev/null +++ b/kernel/bpf/cfg.c @@ -0,0 +1,872 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/filter.h> +#include <linux/sort.h> + +#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) + +/* non-recursive DFS pseudo code + * 1 procedure DFS-iterative(G,v): + * 2 label v as discovered + * 3 let S be a stack + * 4 S.push(v) + * 5 while S is not empty + * 6 t <- S.peek() + * 7 if t is what we're looking for: + * 8 return t + * 9 for all edges e in G.adjacentEdges(t) do + * 10 if edge e is already labelled + * 11 continue with the next edge + * 12 w <- G.adjacentVertex(t,e) + * 13 if vertex w is not discovered and not explored + * 14 label e as tree-edge + * 15 label w as discovered + * 16 S.push(w) + * 17 continue at 5 + * 18 else if vertex w is discovered + * 19 label e as back-edge + * 20 else + * 21 // vertex w is explored + * 22 label e as forward- or cross-edge + * 23 label t as explored + * 24 S.pop() + * + * convention: + * 0x10 - discovered + * 0x11 - discovered and fall-through edge labelled + * 0x12 - discovered and fall-through and branch edges labelled + * 0x20 - explored + */ + +enum { + DISCOVERED = 0x10, + EXPLORED = 0x20, + FALLTHROUGH = 1, + BRANCH = 2, 
+}; + + +static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = bpf_find_containing_subprog(env, off); + subprog->changes_pkt_data = true; +} + +static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = bpf_find_containing_subprog(env, off); + subprog->might_sleep = true; +} + +/* 't' is an index of a call-site. + * 'w' is a callee entry point. + * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. + * Rely on DFS traversal order and absence of recursive calls to guarantee that + * callee's change_pkt_data marks would be correct at that moment. + */ +static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) +{ + struct bpf_subprog_info *caller, *callee; + + caller = bpf_find_containing_subprog(env, t); + callee = bpf_find_containing_subprog(env, w); + caller->changes_pkt_data |= callee->changes_pkt_data; + caller->might_sleep |= callee->might_sleep; +} + +enum { + DONE_EXPLORING = 0, + KEEP_EXPLORING = 1, +}; + +/* t, w, e - match pseudo-code above: + * t - index of current instruction + * w - next instruction + * e - edge + */ +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) +{ + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) + return DONE_EXPLORING; + + if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) + return DONE_EXPLORING; + + if (w < 0 || w >= env->prog->len) { + verbose_linfo(env, t, "%d: ", t); + verbose(env, "jump out of range from insn %d to %d\n", t, w); + return -EINVAL; + } + + if (e == BRANCH) { + /* mark branch target for state pruning */ + mark_prune_point(env, w); + mark_jmp_point(env, w); + } + + if (insn_state[w] == 0) { + /* tree-edge */ + insn_state[t] = DISCOVERED | e; + insn_state[w] = DISCOVERED; + if 
(env->cfg.cur_stack >= env->prog->len) + return -E2BIG; + insn_stack[env->cfg.cur_stack++] = w; + return KEEP_EXPLORING; + } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + if (env->bpf_capable) + return DONE_EXPLORING; + verbose_linfo(env, t, "%d: ", t); + verbose_linfo(env, w, "%d: ", w); + verbose(env, "back-edge from insn %d to %d\n", t, w); + return -EINVAL; + } else if (insn_state[w] == EXPLORED) { + /* forward- or cross-edge */ + insn_state[t] = DISCOVERED | e; + } else { + verifier_bug(env, "insn state internal bug"); + return -EFAULT; + } + return DONE_EXPLORING; +} + +static int visit_func_call_insn(int t, struct bpf_insn *insns, + struct bpf_verifier_env *env, + bool visit_callee) +{ + int ret, insn_sz; + int w; + + insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; + ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); + if (ret) + return ret; + + mark_prune_point(env, t + insn_sz); + /* when we exit from subprog, we need to record non-linear history */ + mark_jmp_point(env, t + insn_sz); + + if (visit_callee) { + w = t + insns[t].imm + 1; + mark_prune_point(env, t); + merge_callee_effects(env, t, w); + ret = push_insn(t, w, BRANCH, env); + } + return ret; +} + +struct bpf_iarray *bpf_iarray_realloc(struct bpf_iarray *old, size_t n_elem) +{ + size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]); + struct bpf_iarray *new; + + new = kvrealloc(old, new_size, GFP_KERNEL_ACCOUNT); + if (!new) { + /* this is what callers always want, so simplify the call site */ + kvfree(old); + return NULL; + } + + new->cnt = n_elem; + return new; +} + +static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items) +{ + struct bpf_insn_array_value *value; + u32 i; + + for (i = start; i <= end; i++) { + value = map->ops->map_lookup_elem(map, &i); + /* + * map_lookup_elem of an array map will never return an error, + * but not checking it makes some static analysers to worry + */ + if (IS_ERR(value)) + return PTR_ERR(value); + else if 
(!value) + return -EINVAL; + items[i - start] = value->xlated_off; + } + return 0; +} + +static int cmp_ptr_to_u32(const void *a, const void *b) +{ + return *(u32 *)a - *(u32 *)b; +} + +static int sort_insn_array_uniq(u32 *items, int cnt) +{ + int unique = 1; + int i; + + sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL); + + for (i = 1; i < cnt; i++) + if (items[i] != items[unique - 1]) + items[unique++] = items[i]; + + return unique; +} + +/* + * sort_unique({map[start], ..., map[end]}) into off + */ +int bpf_copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off) +{ + u32 n = end - start + 1; + int err; + + err = copy_insn_array(map, start, end, off); + if (err) + return err; + + return sort_insn_array_uniq(off, n); +} + +/* + * Copy all unique offsets from the map + */ +static struct bpf_iarray *jt_from_map(struct bpf_map *map) +{ + struct bpf_iarray *jt; + int err; + int n; + + jt = bpf_iarray_realloc(NULL, map->max_entries); + if (!jt) + return ERR_PTR(-ENOMEM); + + n = bpf_copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items); + if (n < 0) { + err = n; + goto err_free; + } + if (n == 0) { + err = -EINVAL; + goto err_free; + } + jt->cnt = n; + return jt; + +err_free: + kvfree(jt); + return ERR_PTR(err); +} + +/* + * Find and collect all maps which fit in the subprog. Return the result as one + * combined jump table in jt->items (allocated with kvcalloc) + */ +static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env, + int subprog_start, int subprog_end) +{ + struct bpf_iarray *jt = NULL; + struct bpf_map *map; + struct bpf_iarray *jt_cur; + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) { + /* + * TODO (when needed): collect only jump tables, not static keys + * or maps for indirect calls + */ + map = env->insn_array_maps[i]; + + jt_cur = jt_from_map(map); + if (IS_ERR(jt_cur)) { + kvfree(jt); + return jt_cur; + } + + /* + * This is enough to check one element. 
The full table is + * checked to fit inside the subprog later in create_jt() + */ + if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) { + u32 old_cnt = jt ? jt->cnt : 0; + jt = bpf_iarray_realloc(jt, old_cnt + jt_cur->cnt); + if (!jt) { + kvfree(jt_cur); + return ERR_PTR(-ENOMEM); + } + memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2); + } + + kvfree(jt_cur); + } + + if (!jt) { + verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start); + return ERR_PTR(-EINVAL); + } + + jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt); + return jt; +} + +static struct bpf_iarray * +create_jt(int t, struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprog; + int subprog_start, subprog_end; + struct bpf_iarray *jt; + int i; + + subprog = bpf_find_containing_subprog(env, t); + subprog_start = subprog->start; + subprog_end = (subprog + 1)->start; + jt = jt_from_subprog(env, subprog_start, subprog_end); + if (IS_ERR(jt)) + return jt; + + /* Check that the every element of the jump table fits within the given subprogram */ + for (i = 0; i < jt->cnt; i++) { + if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) { + verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n", + t, subprog_start, subprog_end); + kvfree(jt); + return ERR_PTR(-EINVAL); + } + } + + return jt; +} + +/* "conditional jump with N edges" */ +static int visit_gotox_insn(int t, struct bpf_verifier_env *env) +{ + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + bool keep_exploring = false; + struct bpf_iarray *jt; + int i, w; + + jt = env->insn_aux_data[t].jt; + if (!jt) { + jt = create_jt(t, env); + if (IS_ERR(jt)) + return PTR_ERR(jt); + + env->insn_aux_data[t].jt = jt; + } + + mark_prune_point(env, t); + for (i = 0; i < jt->cnt; i++) { + w = jt->items[i]; + if (w < 0 || w >= env->prog->len) { + verbose(env, "indirect jump out of range from insn %d to %d\n", t, w); + return -EINVAL; 
+ } + + mark_jmp_point(env, w); + + /* EXPLORED || DISCOVERED */ + if (insn_state[w]) + continue; + + if (env->cfg.cur_stack >= env->prog->len) + return -E2BIG; + + insn_stack[env->cfg.cur_stack++] = w; + insn_state[w] |= DISCOVERED; + keep_exploring = true; + } + + return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING; +} + +/* + * Instructions that can abnormally return from a subprog (tail_call + * upon success, ld_{abs,ind} upon load failure) have a hidden exit + * that the verifier must account for. + */ +static int visit_abnormal_return_insn(struct bpf_verifier_env *env, int t) +{ + struct bpf_subprog_info *subprog; + struct bpf_iarray *jt; + + if (env->insn_aux_data[t].jt) + return 0; + + jt = bpf_iarray_realloc(NULL, 2); + if (!jt) + return -ENOMEM; + + subprog = bpf_find_containing_subprog(env, t); + jt->items[0] = t + 1; + jt->items[1] = subprog->exit_idx; + env->insn_aux_data[t].jt = jt; + return 0; +} + +/* Visits the instruction at index t and returns one of the following: + * < 0 - an error occurred + * DONE_EXPLORING - the instruction was fully explored + * KEEP_EXPLORING - there is still work to be done before it is fully explored + */ +static int visit_insn(int t, struct bpf_verifier_env *env) +{ + struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; + int ret, off, insn_sz; + + if (bpf_pseudo_func(insn)) + return visit_func_call_insn(t, insns, env, true); + + /* All non-branch instructions have a single fall-through edge. */ + if (BPF_CLASS(insn->code) != BPF_JMP && + BPF_CLASS(insn->code) != BPF_JMP32) { + if (BPF_CLASS(insn->code) == BPF_LD && + (BPF_MODE(insn->code) == BPF_ABS || + BPF_MODE(insn->code) == BPF_IND)) { + ret = visit_abnormal_return_insn(env, t); + if (ret) + return ret; + } + insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; + return push_insn(t, t + insn_sz, FALLTHROUGH, env); + } + + switch (BPF_OP(insn->code)) { + case BPF_EXIT: + return DONE_EXPLORING; + + case BPF_CALL: + if (bpf_is_async_callback_calling_insn(insn)) + /* Mark this call insn as a prune point to trigger + * is_state_visited() check before call itself is + * processed by __check_func_call(). Otherwise new + * async state will be pushed for further exploration. + */ + mark_prune_point(env, t); + /* For functions that invoke callbacks it is not known how many times + * callback would be called. Verifier models callback calling functions + * by repeatedly visiting callback bodies and returning to origin call + * instruction. + * In order to stop such iteration verifier needs to identify when a + * state identical some state from a previous iteration is reached. + * Check below forces creation of checkpoint before callback calling + * instruction to allow search for such identical states. + */ + if (bpf_is_sync_callback_calling_insn(insn)) { + mark_calls_callback(env, t); + mark_force_checkpoint(env, t); + mark_prune_point(env, t); + mark_jmp_point(env, t); + } + if (bpf_helper_call(insn)) { + const struct bpf_func_proto *fp; + + ret = bpf_get_helper_proto(env, insn->imm, &fp); + /* If called in a non-sleepable context program will be + * rejected anyway, so we should end up with precise + * sleepable marks on subprogs, except for dead code + * elimination. 
+ */ + if (ret == 0 && fp->might_sleep) + mark_subprog_might_sleep(env, t); + if (bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); + if (insn->imm == BPF_FUNC_tail_call) { + ret = visit_abnormal_return_insn(env, t); + if (ret) + return ret; + } + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + struct bpf_kfunc_call_arg_meta meta; + + ret = bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); + if (ret == 0 && bpf_is_iter_next_kfunc(&meta)) { + mark_prune_point(env, t); + /* Checking and saving state checkpoints at iter_next() call + * is crucial for fast convergence of open-coded iterator loop + * logic, so we need to force it. If we don't do that, + * is_state_visited() might skip saving a checkpoint, causing + * unnecessarily long sequence of not checkpointed + * instructions and jumps, leading to exhaustion of jump + * history buffer, and potentially other undesired outcomes. + * It is expected that with correct open-coded iterators + * convergence will happen quickly, so we don't run a risk of + * exhausting memory. + */ + mark_force_checkpoint(env, t); + } + /* Same as helpers, if called in a non-sleepable context + * program will be rejected anyway, so we should end up + * with precise sleepable marks on subprogs, except for + * dead code elimination. 
+ */ + if (ret == 0 && bpf_is_kfunc_sleepable(&meta)) + mark_subprog_might_sleep(env, t); + if (ret == 0 && bpf_is_kfunc_pkt_changing(&meta)) + mark_subprog_changes_pkt_data(env, t); + } + return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); + + case BPF_JA: + if (BPF_SRC(insn->code) == BPF_X) + return visit_gotox_insn(t, env); + + if (BPF_CLASS(insn->code) == BPF_JMP) + off = insn->off; + else + off = insn->imm; + + /* unconditional jump with single edge */ + ret = push_insn(t, t + off + 1, FALLTHROUGH, env); + if (ret) + return ret; + + mark_prune_point(env, t + off + 1); + mark_jmp_point(env, t + off + 1); + + return ret; + + default: + /* conditional jump with two edges */ + mark_prune_point(env, t); + if (bpf_is_may_goto_insn(insn)) + mark_force_checkpoint(env, t); + + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret) + return ret; + + return push_insn(t, t + insn->off + 1, BRANCH, env); + } +} + +/* non-recursive depth-first-search to detect loops in BPF program + * loop == back-edge in directed graph + */ +int bpf_check_cfg(struct bpf_verifier_env *env) +{ + int insn_cnt = env->prog->len; + int *insn_stack, *insn_state; + int ex_insn_beg, i, ret = 0; + + insn_state = env->cfg.insn_state = kvzalloc_objs(int, insn_cnt, + GFP_KERNEL_ACCOUNT); + if (!insn_state) + return -ENOMEM; + + insn_stack = env->cfg.insn_stack = kvzalloc_objs(int, insn_cnt, + GFP_KERNEL_ACCOUNT); + if (!insn_stack) { + kvfree(insn_state); + return -ENOMEM; + } + + ex_insn_beg = env->exception_callback_subprog + ? 
env->subprog_info[env->exception_callback_subprog].start + : 0; + + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ + insn_stack[0] = 0; /* 0 is the first instruction */ + env->cfg.cur_stack = 1; + +walk_cfg: + while (env->cfg.cur_stack > 0) { + int t = insn_stack[env->cfg.cur_stack - 1]; + + ret = visit_insn(t, env); + switch (ret) { + case DONE_EXPLORING: + insn_state[t] = EXPLORED; + env->cfg.cur_stack--; + break; + case KEEP_EXPLORING: + break; + default: + if (ret > 0) { + verifier_bug(env, "visit_insn internal bug"); + ret = -EFAULT; + } + goto err_free; + } + } + + if (env->cfg.cur_stack < 0) { + verifier_bug(env, "pop stack internal bug"); + ret = -EFAULT; + goto err_free; + } + + if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) { + insn_state[ex_insn_beg] = DISCOVERED; + insn_stack[0] = ex_insn_beg; + env->cfg.cur_stack = 1; + goto walk_cfg; + } + + for (i = 0; i < insn_cnt; i++) { + struct bpf_insn *insn = &env->prog->insnsi[i]; + + if (insn_state[i] != EXPLORED) { + verbose(env, "unreachable insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + if (bpf_is_ldimm64(insn)) { + if (insn_state[i + 1] != 0) { + verbose(env, "jump into the middle of ldimm64 insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + i++; /* skip second half of ldimm64 */ + } + } + ret = 0; /* cfg looks good */ + env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; + env->prog->aux->might_sleep = env->subprog_info[0].might_sleep; + +err_free: + kvfree(insn_state); + kvfree(insn_stack); + env->cfg.insn_state = env->cfg.insn_stack = NULL; + return ret; +} + +/* + * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range + * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start) + * with indices of 'i' instructions in postorder. 
+ */ +int bpf_compute_postorder(struct bpf_verifier_env *env) +{ + u32 cur_postorder, i, top, stack_sz, s; + int *stack = NULL, *postorder = NULL, *state = NULL; + struct bpf_iarray *succ; + + postorder = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT); + state = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT); + stack = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT); + if (!postorder || !state || !stack) { + kvfree(postorder); + kvfree(state); + kvfree(stack); + return -ENOMEM; + } + cur_postorder = 0; + for (i = 0; i < env->subprog_cnt; i++) { + env->subprog_info[i].postorder_start = cur_postorder; + stack[0] = env->subprog_info[i].start; + stack_sz = 1; + do { + top = stack[stack_sz - 1]; + state[top] |= DISCOVERED; + if (state[top] & EXPLORED) { + postorder[cur_postorder++] = top; + stack_sz--; + continue; + } + succ = bpf_insn_successors(env, top); + for (s = 0; s < succ->cnt; ++s) { + if (!state[succ->items[s]]) { + stack[stack_sz++] = succ->items[s]; + state[succ->items[s]] |= DISCOVERED; + } + } + state[top] |= EXPLORED; + } while (stack_sz); + } + env->subprog_info[i].postorder_start = cur_postorder; + env->cfg.insn_postorder = postorder; + env->cfg.cur_postorder = cur_postorder; + kvfree(stack); + kvfree(state); + return 0; +} + +/* + * Compute strongly connected components (SCCs) on the CFG. + * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc. + * If instruction is a sole member of its SCC and there are no self edges, + * assign it SCC number of zero. + * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation. 
+ */ +int bpf_compute_scc(struct bpf_verifier_env *env) +{ + const u32 NOT_ON_STACK = U32_MAX; + + struct bpf_insn_aux_data *aux = env->insn_aux_data; + const u32 insn_cnt = env->prog->len; + int stack_sz, dfs_sz, err = 0; + u32 *stack, *pre, *low, *dfs; + u32 i, j, t, w; + u32 next_preorder_num; + u32 next_scc_id; + bool assign_scc; + struct bpf_iarray *succ; + + next_preorder_num = 1; + next_scc_id = 1; + /* + * - 'stack' accumulates vertices in DFS order, see invariant comment below; + * - 'pre[t] == p' => preorder number of vertex 't' is 'p'; + * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n'; + * - 'dfs' DFS traversal stack, used to emulate explicit recursion. + */ + stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); + dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT); + if (!stack || !pre || !low || !dfs) { + err = -ENOMEM; + goto exit; + } + /* + * References: + * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms" + * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components" + * + * The algorithm maintains the following invariant: + * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]'; + * - then, vertex 'u' remains on stack while vertex 'v' is on stack. + * + * Consequently: + * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u', + * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack, + * and thus there is an SCC (loop) containing both 'u' and 'v'. + * - If 'low[v] == pre[v]', loops containing 'v' have been explored, + * and 'v' can be considered the root of some SCC. 
+ * + * Here is a pseudo-code for an explicitly recursive version of the algorithm: + * + * NOT_ON_STACK = insn_cnt + 1 + * pre = [0] * insn_cnt + * low = [0] * insn_cnt + * scc = [0] * insn_cnt + * stack = [] + * + * next_preorder_num = 1 + * next_scc_id = 1 + * + * def recur(w): + * nonlocal next_preorder_num + * nonlocal next_scc_id + * + * pre[w] = next_preorder_num + * low[w] = next_preorder_num + * next_preorder_num += 1 + * stack.append(w) + * for s in successors(w): + * # Note: for classic algorithm the block below should look as: + * # + * # if pre[s] == 0: + * # recur(s) + * # low[w] = min(low[w], low[s]) + * # elif low[s] != NOT_ON_STACK: + * # low[w] = min(low[w], pre[s]) + * # + * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])' + * # does not break the invariant and makes iterative version of the algorithm + * # simpler. See 'Algorithm #3' from [2]. + * + * # 's' not yet visited + * if pre[s] == 0: + * recur(s) + * # if 's' is on stack, pick lowest reachable preorder number from it; + * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]', + * # so 'min' would be a noop. + * low[w] = min(low[w], low[s]) + * + * if low[w] == pre[w]: + * # 'w' is the root of an SCC, pop all vertices + * # below 'w' on stack and assign same SCC to them. + * while True: + * t = stack.pop() + * low[t] = NOT_ON_STACK + * scc[t] = next_scc_id + * if t == w: + * break + * next_scc_id += 1 + * + * for i in range(0, insn_cnt): + * if pre[i] == 0: + * recur(i) + * + * Below implementation replaces explicit recursion with array 'dfs'. 
+ */ + for (i = 0; i < insn_cnt; i++) { + if (pre[i]) + continue; + stack_sz = 0; + dfs_sz = 1; + dfs[0] = i; +dfs_continue: + while (dfs_sz) { + w = dfs[dfs_sz - 1]; + if (pre[w] == 0) { + low[w] = next_preorder_num; + pre[w] = next_preorder_num; + next_preorder_num++; + stack[stack_sz++] = w; + } + /* Visit 'w' successors */ + succ = bpf_insn_successors(env, w); + for (j = 0; j < succ->cnt; ++j) { + if (pre[succ->items[j]]) { + low[w] = min(low[w], low[succ->items[j]]); + } else { + dfs[dfs_sz++] = succ->items[j]; + goto dfs_continue; + } + } + /* + * Preserve the invariant: if some vertex above in the stack + * is reachable from 'w', keep 'w' on the stack. + */ + if (low[w] < pre[w]) { + dfs_sz--; + goto dfs_continue; + } + /* + * Assign SCC number only if component has two or more elements, + * or if component has a self reference, or if instruction is a + * callback calling function (implicit loop). + */ + assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */ + for (j = 0; j < succ->cnt; ++j) { /* self reference? */ + if (succ->items[j] == w) { + assign_scc = true; + break; + } + } + if (bpf_calls_callback(env, w)) /* implicit loop? */ + assign_scc = true; + /* Pop component elements from stack */ + do { + t = stack[--stack_sz]; + low[t] = NOT_ON_STACK; + if (assign_scc) + aux[t].scc = next_scc_id; + } while (t != w); + if (assign_scc) + next_scc_id++; + dfs_sz--; + } + } + env->scc_info = kvzalloc_objs(*env->scc_info, next_scc_id, + GFP_KERNEL_ACCOUNT); + if (!env->scc_info) { + err = -ENOMEM; + goto exit; + } + env->scc_cnt = next_scc_id; +exit: + kvfree(stack); + kvfree(pre); + kvfree(low); + kvfree(dfs); + return err; +} diff --git a/kernel/bpf/check_btf.c b/kernel/bpf/check_btf.c new file mode 100644 index 000000000000..93bebe6fe12e --- /dev/null +++ b/kernel/bpf/check_btf.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/filter.h> +#include <linux/btf.h> + +#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) + +static int check_abnormal_return(struct bpf_verifier_env *env) +{ + int i; + + for (i = 1; i < env->subprog_cnt; i++) { + if (env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + if (env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is not allowed in subprogs without BTF\n"); + return -EINVAL; + } + } + return 0; +} + +/* The minimum supported BTF func info size */ +#define MIN_BPF_FUNCINFO_SIZE 8 +#define MAX_FUNCINFO_REC_SIZE 252 + +static int check_btf_func_early(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + u32 krec_size = sizeof(struct bpf_func_info); + const struct btf_type *type, *func_proto; + u32 i, nfuncs, urec_size, min_size; + struct bpf_func_info *krecord; + struct bpf_prog *prog; + const struct btf *btf; + u32 prev_offset = 0; + bpfptr_t urecord; + int ret = -ENOMEM; + + nfuncs = attr->func_info_cnt; + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; + return 0; + } + + urec_size = attr->func_info_rec_size; + if (urec_size < MIN_BPF_FUNCINFO_SIZE || + urec_size > MAX_FUNCINFO_REC_SIZE || + urec_size % sizeof(u32)) { + verbose(env, "invalid func info rec size %u\n", urec_size); + return -EINVAL; + } + + prog = env->prog; + btf = prog->aux->btf; + + urecord = make_bpfptr(attr->func_info, uattr.is_kernel); + min_size = min_t(u32, krec_size, urec_size); + + krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + if (!krecord) + return -ENOMEM; + + for (i = 0; i < nfuncs; i++) { + ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); + if (ret) { + if (ret == -E2BIG) { + verbose(env, "nonzero tailing record in func info"); + /* set the size kernel expects so loader can zero + * out the rest of the 
record. + */ + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, func_info_rec_size), + &min_size, sizeof(min_size))) + ret = -EFAULT; + } + goto err_free; + } + + if (copy_from_bpfptr(&krecord[i], urecord, min_size)) { + ret = -EFAULT; + goto err_free; + } + + /* check insn_off */ + ret = -EINVAL; + if (i == 0) { + if (krecord[i].insn_off) { + verbose(env, + "nonzero insn_off %u for the first func info record", + krecord[i].insn_off); + goto err_free; + } + } else if (krecord[i].insn_off <= prev_offset) { + verbose(env, + "same or smaller insn offset (%u) than previous func info record (%u)", + krecord[i].insn_off, prev_offset); + goto err_free; + } + + /* check type_id */ + type = btf_type_by_id(btf, krecord[i].type_id); + if (!type || !btf_type_is_func(type)) { + verbose(env, "invalid type id %d in func info", + krecord[i].type_id); + goto err_free; + } + + func_proto = btf_type_by_id(btf, type->type); + if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) + /* btf_func_check() already verified it during BTF load */ + goto err_free; + + prev_offset = krecord[i].insn_off; + bpfptr_add(&urecord, urec_size); + } + + prog->aux->func_info = krecord; + prog->aux->func_info_cnt = nfuncs; + return 0; + +err_free: + kvfree(krecord); + return ret; +} + +static int check_btf_func(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + const struct btf_type *type, *func_proto, *ret_type; + u32 i, nfuncs, urec_size; + struct bpf_func_info *krecord; + struct bpf_func_info_aux *info_aux = NULL; + struct bpf_prog *prog; + const struct btf *btf; + bpfptr_t urecord; + bool scalar_return; + int ret = -ENOMEM; + + nfuncs = attr->func_info_cnt; + if (!nfuncs) { + if (check_abnormal_return(env)) + return -EINVAL; + return 0; + } + if (nfuncs != env->subprog_cnt) { + verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); + return -EINVAL; + } + + urec_size = attr->func_info_rec_size; + + prog = env->prog; + 
btf = prog->aux->btf; + + urecord = make_bpfptr(attr->func_info, uattr.is_kernel); + + krecord = prog->aux->func_info; + info_aux = kzalloc_objs(*info_aux, nfuncs, + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + if (!info_aux) + return -ENOMEM; + + for (i = 0; i < nfuncs; i++) { + /* check insn_off */ + ret = -EINVAL; + + if (env->subprog_info[i].start != krecord[i].insn_off) { + verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); + goto err_free; + } + + /* Already checked type_id */ + type = btf_type_by_id(btf, krecord[i].type_id); + info_aux[i].linkage = BTF_INFO_VLEN(type->info); + /* Already checked func_proto */ + func_proto = btf_type_by_id(btf, type->type); + + ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); + scalar_return = + btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); + if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { + verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); + goto err_free; + } + if (i && !scalar_return && env->subprog_info[i].has_tail_call) { + verbose(env, "tail_call is only allowed in functions that return 'int'.\n"); + goto err_free; + } + + env->subprog_info[i].name = btf_name_by_offset(btf, type->name_off); + bpfptr_add(&urecord, urec_size); + } + + prog->aux->func_info_aux = info_aux; + return 0; + +err_free: + kfree(info_aux); + return ret; +} + +#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col) +#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE + +static int check_btf_line(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; + struct bpf_subprog_info *sub; + struct bpf_line_info *linfo; + struct bpf_prog *prog; + const struct btf *btf; + bpfptr_t ulinfo; + int err; + + nr_linfo = attr->line_info_cnt; + if (!nr_linfo) + return 0; + if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) + return -EINVAL; + + rec_size 
= attr->line_info_rec_size; + if (rec_size < MIN_BPF_LINEINFO_SIZE || + rec_size > MAX_LINEINFO_REC_SIZE || + rec_size & (sizeof(u32) - 1)) + return -EINVAL; + + /* Need to zero it in case the userspace may + * pass in a smaller bpf_line_info object. + */ + linfo = kvzalloc_objs(struct bpf_line_info, nr_linfo, + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + if (!linfo) + return -ENOMEM; + + prog = env->prog; + btf = prog->aux->btf; + + s = 0; + sub = env->subprog_info; + ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel); + expected_size = sizeof(struct bpf_line_info); + ncopy = min_t(u32, expected_size, rec_size); + for (i = 0; i < nr_linfo; i++) { + err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size); + if (err) { + if (err == -E2BIG) { + verbose(env, "nonzero tailing record in line_info"); + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, line_info_rec_size), + &expected_size, sizeof(expected_size))) + err = -EFAULT; + } + goto err_free; + } + + if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) { + err = -EFAULT; + goto err_free; + } + + /* + * Check insn_off to ensure + * 1) strictly increasing AND + * 2) bounded by prog->len + * + * The linfo[0].insn_off == 0 check logically falls into + * the later "missing bpf_line_info for func..." case + * because the first linfo[0].insn_off must be the + * first sub also and the first sub must have + * subprog_info[0].start == 0. 
+ */ + if ((i && linfo[i].insn_off <= prev_offset) || + linfo[i].insn_off >= prog->len) { + verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n", + i, linfo[i].insn_off, prev_offset, + prog->len); + err = -EINVAL; + goto err_free; + } + + if (!prog->insnsi[linfo[i].insn_off].code) { + verbose(env, + "Invalid insn code at line_info[%u].insn_off\n", + i); + err = -EINVAL; + goto err_free; + } + + if (!btf_name_by_offset(btf, linfo[i].line_off) || + !btf_name_by_offset(btf, linfo[i].file_name_off)) { + verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); + err = -EINVAL; + goto err_free; + } + + if (s != env->subprog_cnt) { + if (linfo[i].insn_off == sub[s].start) { + sub[s].linfo_idx = i; + s++; + } else if (sub[s].start < linfo[i].insn_off) { + verbose(env, "missing bpf_line_info for func#%u\n", s); + err = -EINVAL; + goto err_free; + } + } + + prev_offset = linfo[i].insn_off; + bpfptr_add(&ulinfo, rec_size); + } + + if (s != env->subprog_cnt) { + verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n", + env->subprog_cnt - s, s); + err = -EINVAL; + goto err_free; + } + + prog->aux->linfo = linfo; + prog->aux->nr_linfo = nr_linfo; + + return 0; + +err_free: + kvfree(linfo); + return err; +} + +#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo) +#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE + +static int check_core_relo(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + u32 i, nr_core_relo, ncopy, expected_size, rec_size; + struct bpf_core_relo core_relo = {}; + struct bpf_prog *prog = env->prog; + const struct btf *btf = prog->aux->btf; + struct bpf_core_ctx ctx = { + .log = &env->log, + .btf = btf, + }; + bpfptr_t u_core_relo; + int err; + + nr_core_relo = attr->core_relo_cnt; + if (!nr_core_relo) + return 0; + if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo)) + return -EINVAL; + + rec_size = attr->core_relo_rec_size; + if (rec_size < MIN_CORE_RELO_SIZE 
|| + rec_size > MAX_CORE_RELO_SIZE || + rec_size % sizeof(u32)) + return -EINVAL; + + u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel); + expected_size = sizeof(struct bpf_core_relo); + ncopy = min_t(u32, expected_size, rec_size); + + /* Unlike func_info and line_info, copy and apply each CO-RE + * relocation record one at a time. + */ + for (i = 0; i < nr_core_relo; i++) { + /* future proofing when sizeof(bpf_core_relo) changes */ + err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size); + if (err) { + if (err == -E2BIG) { + verbose(env, "nonzero tailing record in core_relo"); + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, core_relo_rec_size), + &expected_size, sizeof(expected_size))) + err = -EFAULT; + } + break; + } + + if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) { + err = -EFAULT; + break; + } + + if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) { + verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n", + i, core_relo.insn_off, prog->len); + err = -EINVAL; + break; + } + + err = bpf_core_apply(&ctx, &core_relo, i, + &prog->insnsi[core_relo.insn_off / 8]); + if (err) + break; + bpfptr_add(&u_core_relo, rec_size); + } + return err; +} + +int bpf_check_btf_info_early(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + struct btf *btf; + int err; + + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; + return 0; + } + + btf = btf_get_by_fd(attr->prog_btf_fd); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (btf_is_kernel(btf)) { + btf_put(btf); + return -EACCES; + } + env->prog->aux->btf = btf; + + err = check_btf_func_early(env, attr, uattr); + if (err) + return err; + return 0; +} + +int bpf_check_btf_info(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + int err; + + if (!attr->func_info_cnt && !attr->line_info_cnt) { + if (check_abnormal_return(env)) + return -EINVAL; + 
return 0; + } + + err = check_btf_func(env, attr, uattr); + if (err) + return err; + + err = check_btf_line(env, attr, uattr); + if (err) + return err; + + err = check_core_relo(env, attr, uattr); + if (err) + return err; + + return 0; +} diff --git a/kernel/bpf/const_fold.c b/kernel/bpf/const_fold.c new file mode 100644 index 000000000000..db73c4740b1e --- /dev/null +++ b/kernel/bpf/const_fold.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <linux/bpf_verifier.h> + +/* + * Forward dataflow analysis to determine constant register values at every + * instruction. Tracks 64-bit constant values in R0-R9 through the program, + * using a fixed-point iteration in reverse postorder. Records which registers + * hold known constants and their values in + * env->insn_aux_data[].{const_reg_mask, const_reg_vals}. + */ + +enum const_arg_state { + CONST_ARG_UNVISITED, /* instruction not yet reached */ + CONST_ARG_UNKNOWN, /* register value not a known constant */ + CONST_ARG_CONST, /* register holds a known 64-bit constant */ + CONST_ARG_MAP_PTR, /* register holds a map pointer, map_index is set */ + CONST_ARG_MAP_VALUE, /* register points to map value data, val is offset */ + CONST_ARG_SUBPROG, /* register holds a subprog pointer, val is subprog number */ +}; + +struct const_arg_info { + enum const_arg_state state; + u32 map_index; + u64 val; +}; + +static bool ci_is_unvisited(const struct const_arg_info *ci) +{ + return ci->state == CONST_ARG_UNVISITED; +} + +static bool ci_is_unknown(const struct const_arg_info *ci) +{ + return ci->state == CONST_ARG_UNKNOWN; +} + +static bool ci_is_const(const struct const_arg_info *ci) +{ + return ci->state == CONST_ARG_CONST; +} + +static bool ci_is_map_value(const struct const_arg_info *ci) +{ + return ci->state == CONST_ARG_MAP_VALUE; +} + +/* Transfer function: compute output register state from instruction. 
*/ +static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info *ci_out, + struct bpf_insn *insn, struct bpf_insn *insns, int idx) +{ + struct const_arg_info unknown = { .state = CONST_ARG_UNKNOWN, .val = 0 }; + struct const_arg_info *dst = &ci_out[insn->dst_reg]; + struct const_arg_info *src = &ci_out[insn->src_reg]; + u8 class = BPF_CLASS(insn->code); + u8 mode = BPF_MODE(insn->code); + u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code); + int r; + + switch (class) { + case BPF_ALU: + case BPF_ALU64: + switch (opcode) { + case BPF_MOV | BPF_K: + dst->state = CONST_ARG_CONST; + dst->val = (s64)insn->imm; + break; + case BPF_MOV | BPF_X: + *dst = *src; + if (!insn->off) + break; + if (!ci_is_const(dst)) { + *dst = unknown; + break; + } + switch (insn->off) { + case 8: dst->val = (s8)dst->val; break; + case 16: dst->val = (s16)dst->val; break; + case 32: dst->val = (s32)dst->val; break; + default: *dst = unknown; break; + } + break; + case BPF_ADD | BPF_K: + if (!ci_is_const(dst) && !ci_is_map_value(dst)) { + *dst = unknown; + break; + } + dst->val += insn->imm; + break; + case BPF_SUB | BPF_K: + if (!ci_is_const(dst) && !ci_is_map_value(dst)) { + *dst = unknown; + break; + } + dst->val -= insn->imm; + break; + case BPF_AND | BPF_K: + if (!ci_is_const(dst)) { + if (!insn->imm) { + dst->state = CONST_ARG_CONST; + dst->val = 0; + } else { + *dst = unknown; + } + break; + } + dst->val &= (s64)insn->imm; + break; + case BPF_AND | BPF_X: + if (ci_is_const(dst) && dst->val == 0) + break; /* 0 & x == 0 */ + if (ci_is_const(src) && src->val == 0) { + dst->state = CONST_ARG_CONST; + dst->val = 0; + break; + } + if (!ci_is_const(dst) || !ci_is_const(src)) { + *dst = unknown; + break; + } + dst->val &= src->val; + break; + default: + *dst = unknown; + break; + } + if (class == BPF_ALU) { + if (ci_is_const(dst)) + dst->val = (u32)dst->val; + else if (!ci_is_unknown(dst)) + *dst = unknown; + } + break; + case BPF_LD: + if (mode == BPF_ABS || mode == BPF_IND) 
+ goto process_call; + if (mode != BPF_IMM || BPF_SIZE(insn->code) != BPF_DW) + break; + if (insn->src_reg == BPF_PSEUDO_FUNC) { + int subprog = bpf_find_subprog(env, idx + insn->imm + 1); + + if (subprog >= 0) { + dst->state = CONST_ARG_SUBPROG; + dst->val = subprog; + } else { + *dst = unknown; + } + } else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || + insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { + dst->state = CONST_ARG_MAP_VALUE; + dst->map_index = env->insn_aux_data[idx].map_index; + dst->val = env->insn_aux_data[idx].map_off; + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_IDX) { + dst->state = CONST_ARG_MAP_PTR; + dst->map_index = env->insn_aux_data[idx].map_index; + } else if (insn->src_reg == 0) { + dst->state = CONST_ARG_CONST; + dst->val = (u64)(u32)insn->imm | ((u64)(u32)insns[idx + 1].imm << 32); + } else { + *dst = unknown; + } + break; + case BPF_LDX: + if (!ci_is_map_value(src)) { + *dst = unknown; + break; + } + struct bpf_map *map = env->used_maps[src->map_index]; + int size = bpf_size_to_bytes(BPF_SIZE(insn->code)); + bool is_ldsx = mode == BPF_MEMSX; + int off = src->val + insn->off; + u64 val = 0; + + if (!bpf_map_is_rdonly(map) || !map->ops->map_direct_value_addr || + map->map_type == BPF_MAP_TYPE_INSN_ARRAY || + off < 0 || off + size > map->value_size || + bpf_map_direct_read(map, off, size, &val, is_ldsx)) { + *dst = unknown; + break; + } + dst->state = CONST_ARG_CONST; + dst->val = val; + break; + case BPF_JMP: + if (opcode != BPF_CALL) + break; +process_call: + for (r = BPF_REG_0; r <= BPF_REG_5; r++) + ci_out[r] = unknown; + break; + case BPF_STX: + if (mode != BPF_ATOMIC) + break; + if (insn->imm == BPF_CMPXCHG) + ci_out[BPF_REG_0] = unknown; + else if (insn->imm == BPF_LOAD_ACQ) + *dst = unknown; + else if (insn->imm & BPF_FETCH) + *src = unknown; + break; + } +} + +/* Join function: merge output state into a successor's input state. 
*/ +static bool const_reg_join(struct const_arg_info *ci_target, + struct const_arg_info *ci_out) +{ + bool changed = false; + int r; + + for (r = 0; r < MAX_BPF_REG; r++) { + struct const_arg_info *old = &ci_target[r]; + struct const_arg_info *new = &ci_out[r]; + + if (ci_is_unvisited(old) && !ci_is_unvisited(new)) { + ci_target[r] = *new; + changed = true; + } else if (!ci_is_unknown(old) && !ci_is_unvisited(old) && + (new->state != old->state || new->val != old->val || + new->map_index != old->map_index)) { + old->state = CONST_ARG_UNKNOWN; + changed = true; + } + } + return changed; +} + +int bpf_compute_const_regs(struct bpf_verifier_env *env) +{ + struct const_arg_info unknown = { .state = CONST_ARG_UNKNOWN, .val = 0 }; + struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct const_arg_info (*ci_in)[MAX_BPF_REG]; + struct const_arg_info ci_out[MAX_BPF_REG]; + struct bpf_iarray *succ; + bool changed; + int i, r; + + /* kvzalloc zeroes memory, so all entries start as CONST_ARG_UNVISITED (0) */ + ci_in = kvzalloc_objs(*ci_in, insn_cnt, GFP_KERNEL_ACCOUNT); + if (!ci_in) + return -ENOMEM; + + /* Subprogram entries (including main at subprog 0): all registers unknown */ + for (i = 0; i < env->subprog_cnt; i++) { + int start = env->subprog_info[i].start; + + for (r = 0; r < MAX_BPF_REG; r++) + ci_in[start][r] = unknown; + } + +redo: + changed = false; + for (i = env->cfg.cur_postorder - 1; i >= 0; i--) { + int idx = env->cfg.insn_postorder[i]; + struct bpf_insn *insn = &insns[idx]; + struct const_arg_info *ci = ci_in[idx]; + + memcpy(ci_out, ci, sizeof(ci_out)); + + const_reg_xfer(env, ci_out, insn, insns, idx); + + succ = bpf_insn_successors(env, idx); + for (int s = 0; s < succ->cnt; s++) + changed |= const_reg_join(ci_in[succ->items[s]], ci_out); + } + if (changed) + goto redo; + + /* Save computed constants into insn_aux[] if they fit into 32-bit */ + for (i = 0; i < 
insn_cnt; i++) { + u16 mask = 0, map_mask = 0, subprog_mask = 0; + struct bpf_insn_aux_data *aux = &insn_aux[i]; + struct const_arg_info *ci = ci_in[i]; + + for (r = BPF_REG_0; r < ARRAY_SIZE(aux->const_reg_vals); r++) { + struct const_arg_info *c = &ci[r]; + + switch (c->state) { + case CONST_ARG_CONST: { + u64 val = c->val; + + if (val != (u32)val) + break; + mask |= BIT(r); + aux->const_reg_vals[r] = val; + break; + } + case CONST_ARG_MAP_PTR: + map_mask |= BIT(r); + aux->const_reg_vals[r] = c->map_index; + break; + case CONST_ARG_SUBPROG: + subprog_mask |= BIT(r); + aux->const_reg_vals[r] = c->val; + break; + default: + break; + } + } + aux->const_reg_mask = mask; + aux->const_reg_map_mask = map_mask; + aux->const_reg_subprog_mask = subprog_mask; + } + + kvfree(ci_in); + return 0; +} + +static int eval_const_branch(u8 opcode, u64 dst_val, u64 src_val) +{ + switch (BPF_OP(opcode)) { + case BPF_JEQ: return dst_val == src_val; + case BPF_JNE: return dst_val != src_val; + case BPF_JGT: return dst_val > src_val; + case BPF_JGE: return dst_val >= src_val; + case BPF_JLT: return dst_val < src_val; + case BPF_JLE: return dst_val <= src_val; + case BPF_JSGT: return (s64)dst_val > (s64)src_val; + case BPF_JSGE: return (s64)dst_val >= (s64)src_val; + case BPF_JSLT: return (s64)dst_val < (s64)src_val; + case BPF_JSLE: return (s64)dst_val <= (s64)src_val; + case BPF_JSET: return (bool)(dst_val & src_val); + default: return -1; + } +} + +/* + * Rewrite conditional branches with constant outcomes into unconditional + * jumps using register values resolved by bpf_compute_const_regs() pass. + * This eliminates dead edges from the CFG so that compute_live_registers() + * doesn't propagate liveness through dead code. 
+ */ +int bpf_prune_dead_branches(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int insn_cnt = env->prog->len; + bool changed = false; + int i; + + for (i = 0; i < insn_cnt; i++) { + struct bpf_insn_aux_data *aux = &insn_aux[i]; + struct bpf_insn *insn = &insns[i]; + u8 class = BPF_CLASS(insn->code); + u64 dst_val, src_val; + int taken; + + if (!bpf_insn_is_cond_jump(insn->code)) + continue; + if (bpf_is_may_goto_insn(insn)) + continue; + + if (!(aux->const_reg_mask & BIT(insn->dst_reg))) + continue; + dst_val = aux->const_reg_vals[insn->dst_reg]; + + if (BPF_SRC(insn->code) == BPF_K) { + src_val = insn->imm; + } else { + if (!(aux->const_reg_mask & BIT(insn->src_reg))) + continue; + src_val = aux->const_reg_vals[insn->src_reg]; + } + + if (class == BPF_JMP32) { + /* + * The (s32) cast maps the 32-bit range into two u64 sub-ranges: + * [0x00000000, 0x7FFFFFFF] -> [0x0000000000000000, 0x000000007FFFFFFF] + * [0x80000000, 0xFFFFFFFF] -> [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF] + * The ordering is preserved within each sub-range, and + * the second sub-range is above the first as u64. + */ + dst_val = (s32)dst_val; + src_val = (s32)src_val; + } + + taken = eval_const_branch(insn->code, dst_val, src_val); + if (taken < 0) { + bpf_log(&env->log, "Unknown conditional jump %x\n", insn->code); + return -EFAULT; + } + *insn = BPF_JMP_A(taken ? 
insn->off : 0); + changed = true; + } + + if (!changed) + return 0; + /* recompute postorder, since CFG has changed */ + kvfree(env->cfg.insn_postorder); + env->cfg.insn_postorder = NULL; + return bpf_compute_postorder(env); +} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3ece2da55625..8b018ff48875 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -18,7 +18,6 @@ */ #include <uapi/linux/btf.h> -#include <crypto/sha1.h> #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> @@ -1422,6 +1421,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); break; + + case BPF_ST | BPF_PROBE_MEM32 | BPF_DW: + case BPF_ST | BPF_PROBE_MEM32 | BPF_W: + case BPF_ST | BPF_PROBE_MEM32 | BPF_H: + case BPF_ST | BPF_PROBE_MEM32 | BPF_B: + *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ + from->imm); + *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + /* + * Cannot use BPF_STX_MEM() macro here as it + * hardcodes BPF_MEM mode, losing PROBE_MEM32 + * and breaking arena addressing in the JIT. + */ + *to++ = (struct bpf_insn) { + .code = BPF_STX | BPF_PROBE_MEM32 | + BPF_SIZE(from->code), + .dst_reg = from->dst_reg, + .src_reg = BPF_REG_AX, + .off = from->off, + }; + break; } out: return to - to_buff; @@ -1466,27 +1486,16 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) * know whether fp here is the clone or the original. 
*/ fp->aux->prog = fp; + if (fp->aux->offload) + fp->aux->offload->prog = fp; bpf_prog_clone_free(fp_other); } -static void adjust_insn_arrays(struct bpf_prog *prog, u32 off, u32 len) -{ -#ifdef CONFIG_BPF_SYSCALL - struct bpf_map *map; - int i; - - if (len <= 1) - return; - - for (i = 0; i < prog->aux->used_map_cnt; i++) { - map = prog->aux->used_maps[i]; - if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) - bpf_insn_array_adjust(map, off, len); - } -#endif -} - -struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) +/* + * Now this function is used only to blind the main prog and must be invoked only when + * bpf_prog_need_blind() returns true. + */ +struct bpf_prog *bpf_jit_blind_constants(struct bpf_verifier_env *env, struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; struct bpf_prog *clone, *tmp; @@ -1494,13 +1503,17 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!prog->blinding_requested || prog->blinded) - return prog; + if (WARN_ON_ONCE(env && env->prog != prog)) + return ERR_PTR(-EINVAL); clone = bpf_prog_clone_create(prog, GFP_USER); if (!clone) return ERR_PTR(-ENOMEM); + /* make sure bpf_patch_insn_data() patches the correct prog */ + if (env) + env->prog = clone; + insn_cnt = clone->len; insn = clone->insnsi; @@ -1528,21 +1541,28 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) if (!rewritten) continue; - tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); - if (IS_ERR(tmp)) { + if (env) + tmp = bpf_patch_insn_data(env, i, insn_buff, rewritten); + else + tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); + + if (IS_ERR_OR_NULL(tmp)) { + if (env) + /* restore the original prog */ + env->prog = prog; /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); - return tmp; + return IS_ERR(tmp) ? 
tmp : ERR_PTR(-ENOMEM); } clone = tmp; insn_delta = rewritten - 1; - /* Instructions arrays must be updated using absolute xlated offsets */ - adjust_insn_arrays(clone, prog->aux->subprog_start + i, rewritten); + if (env) + env->prog = clone; /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; @@ -1553,6 +1573,15 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) clone->blinded = 1; return clone; } + +bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struct bpf_prog *prog, + int insn_idx) +{ + if (!env) + return false; + insn_idx += prog->aux->subprog_start; + return env->insn_aux_data[insn_idx].indirect_target; +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, @@ -1736,6 +1765,12 @@ bool bpf_opcode_in_insntable(u8 code) } #ifndef CONFIG_BPF_JIT_ALWAYS_ON +/* Absolute value of s32 without undefined behavior for S32_MIN */ +static u32 abs_s32(s32 x) +{ + return x >= 0 ? 
(u32)x : -(u32)x; +} + /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -1900,8 +1935,8 @@ select_insn: DST = do_div(AX, (u32) SRC); break; case 1: - AX = abs((s32)DST); - AX = do_div(AX, abs((s32)SRC)); + AX = abs_s32((s32)DST); + AX = do_div(AX, abs_s32((s32)SRC)); if ((s32)DST < 0) DST = (u32)-AX; else @@ -1928,8 +1963,8 @@ select_insn: DST = do_div(AX, (u32) IMM); break; case 1: - AX = abs((s32)DST); - AX = do_div(AX, abs((s32)IMM)); + AX = abs_s32((s32)DST); + AX = do_div(AX, abs_s32((s32)IMM)); if ((s32)DST < 0) DST = (u32)-AX; else @@ -1955,8 +1990,8 @@ select_insn: DST = (u32) AX; break; case 1: - AX = abs((s32)DST); - do_div(AX, abs((s32)SRC)); + AX = abs_s32((s32)DST); + do_div(AX, abs_s32((s32)SRC)); if (((s32)DST < 0) == ((s32)SRC < 0)) DST = (u32)AX; else @@ -1982,8 +2017,8 @@ select_insn: DST = (u32) AX; break; case 1: - AX = abs((s32)DST); - do_div(AX, abs((s32)IMM)); + AX = abs_s32((s32)DST); + do_div(AX, abs_s32((s32)IMM)); if (((s32)DST < 0) == ((s32)IMM < 0)) DST = (u32)AX; else @@ -2060,12 +2095,12 @@ select_insn: if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; - tail_call_cnt++; - prog = READ_ONCE(array->ptrs[index]); if (!prog) goto out; + tail_call_cnt++; + /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is * handled like a helper, that is, bpf_tail_call_proto, @@ -2505,18 +2540,55 @@ static bool bpf_prog_select_interpreter(struct bpf_prog *fp) return select_interpreter; } -/** - * bpf_prog_select_runtime - select exec runtime for BPF program - * @fp: bpf_prog populated with BPF program - * @err: pointer to error variable - * - * Try to JIT eBPF program, if JIT is not available, use interpreter. - * The BPF program will be executed via bpf_prog_run() function. 
- * - * Return: the &fp argument along with &err set to 0 for success or - * a negative errno code on failure - */ -struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) +static struct bpf_prog *bpf_prog_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) +{ +#ifdef CONFIG_BPF_JIT + struct bpf_prog *orig_prog; + struct bpf_insn_aux_data *orig_insn_aux; + + if (!bpf_prog_need_blind(prog)) + return bpf_int_jit_compile(env, prog); + + if (env) { + /* + * If env is not NULL, we are called from the end of bpf_check(), at this + * point, only insn_aux_data is used after failure, so it should be restored + * on failure. + */ + orig_insn_aux = bpf_dup_insn_aux_data(env); + if (!orig_insn_aux) + return prog; + } + + orig_prog = prog; + prog = bpf_jit_blind_constants(env, prog); + /* + * If blinding was requested and we failed during blinding, we must fall + * back to the interpreter. + */ + if (IS_ERR(prog)) + goto out_restore; + + prog = bpf_int_jit_compile(env, prog); + if (prog->jited) { + bpf_jit_prog_release_other(prog, orig_prog); + if (env) + vfree(orig_insn_aux); + return prog; + } + + bpf_jit_prog_release_other(orig_prog, prog); + +out_restore: + prog = orig_prog; + if (env) + bpf_restore_insn_aux_data(env, orig_insn_aux); +#endif + return prog; +} + +struct bpf_prog *__bpf_prog_select_runtime(struct bpf_verifier_env *env, struct bpf_prog *fp, + int *err) { /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. 
@@ -2544,7 +2616,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) if (*err) return fp; - fp = bpf_int_jit_compile(fp); + fp = bpf_prog_jit_compile(env, fp); bpf_prog_jit_attempt_done(fp); if (!fp->jited && jit_needed) { *err = -ENOTSUPP; @@ -2570,6 +2642,22 @@ finalize: return fp; } + +/** + * bpf_prog_select_runtime - select exec runtime for BPF program + * @fp: bpf_prog populated with BPF program + * @err: pointer to error variable + * + * Try to JIT eBPF program, if JIT is not available, use interpreter. + * The BPF program will be executed via bpf_prog_run() function. + * + * Return: the &fp argument along with &err set to 0 for success or + * a negative errno code on failure + */ +struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) +{ + return __bpf_prog_select_runtime(NULL, fp, err); +} EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static unsigned int __bpf_prog_ret1(const void *ctx, @@ -2586,8 +2674,10 @@ static struct bpf_prog_dummy { }, }; -struct bpf_empty_prog_array bpf_empty_prog_array = { - .null_prog = NULL, +struct bpf_prog_array bpf_empty_prog_array = { + .items = { + { .prog = NULL }, + }, }; EXPORT_SYMBOL(bpf_empty_prog_array); @@ -2598,14 +2688,14 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) if (prog_cnt) p = kzalloc_flex(*p, items, prog_cnt + 1, flags); else - p = &bpf_empty_prog_array.hdr; + p = &bpf_empty_prog_array; return p; } void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || progs == &bpf_empty_prog_array.hdr) + if (!progs || progs == &bpf_empty_prog_array) return; kfree_rcu(progs, rcu); } @@ -2614,19 +2704,17 @@ static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) { struct bpf_prog_array *progs; - /* If RCU Tasks Trace grace period implies RCU grace period, there is - * no need to call kfree_rcu(), just call kfree() directly. 
+ /* + * RCU Tasks Trace grace period implies RCU grace period, there is no + * need to call kfree_rcu(), just call kfree() directly. */ progs = container_of(rcu, struct bpf_prog_array, rcu); - if (rcu_trace_implies_rcu_gp()) - kfree(progs); - else - kfree_rcu(progs, rcu); + kfree(progs); } void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) { - if (!progs || progs == &bpf_empty_prog_array.hdr) + if (!progs || progs == &bpf_empty_prog_array) return; call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); } @@ -3057,7 +3145,7 @@ const struct bpf_func_proto bpf_tail_call_proto = { * It is encouraged to implement bpf_int_jit_compile() instead, so that * eBPF and implicitly also cBPF can get JITed! */ -struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) +struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog) { return prog; } @@ -3287,6 +3375,63 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); #ifdef CONFIG_BPF_SYSCALL +void bpf_get_linfo_file_line(struct btf *btf, const struct bpf_line_info *linfo, + const char **filep, const char **linep, int *nump) +{ + /* Get base component of the file path. */ + if (filep) { + *filep = btf_name_by_offset(btf, linfo->file_name_off); + *filep = kbasename(*filep); + } + + /* Obtain the source line, and strip whitespace in prefix. */ + if (linep) { + *linep = btf_name_by_offset(btf, linfo->line_off); + while (isspace(**linep)) + *linep += 1; + } + + if (nump) + *nump = BPF_LINE_INFO_LINE_NUM(linfo->line_col); +} + +const struct bpf_line_info *bpf_find_linfo(const struct bpf_prog *prog, u32 insn_off) +{ + const struct bpf_line_info *linfo; + u32 nr_linfo; + int l, r, m; + + nr_linfo = prog->aux->nr_linfo; + if (!nr_linfo || insn_off >= prog->len) + return NULL; + + linfo = prog->aux->linfo; + /* Loop invariant: linfo[l].insn_off <= insns_off. + * linfo[0].insn_off == 0 which always satisfies above condition. 
+ * Binary search is searching for rightmost linfo entry that satisfies + * the above invariant, giving us the desired record that covers given + * instruction offset. + */ + l = 0; + r = nr_linfo - 1; + while (l < r) { + /* (r - l + 1) / 2 means we break a tie to the right, so if: + * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off, + * then m=2, we see that linfo[m].insn_off > insn_off, and so + * r becomes 1 and we exit the loop with correct l==1. + * If the tie was broken to the left, m=1 would end us up in + * an endless loop where l and m stay at 1 and r stays at 2. + */ + m = l + (r - l + 1) / 2; + if (linfo[m].insn_off <= insn_off) + l = m; + else + r = m - 1; + } + + return &linfo[l]; +} + int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep, int *nump) { @@ -3321,14 +3466,7 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * if (idx == -1) return -ENOENT; - /* Get base component of the file path. */ - *filep = btf_name_by_offset(btf, linfo[idx].file_name_off); - *filep = kbasename(*filep); - /* Obtain the source line, and strip whitespace in prefix. 
*/ - *linep = btf_name_by_offset(btf, linfo[idx].line_off); - while (isspace(**linep)) - *linep += 1; - *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col); + bpf_get_linfo_file_line(btf, &linfo[idx], filep, linep, nump); return 0; } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 04171fbc39cb..5e59ab896f05 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -29,6 +29,7 @@ #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/kthread.h> +#include <linux/local_lock.h> #include <linux/completion.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> @@ -52,6 +53,7 @@ struct xdp_bulk_queue { struct list_head flush_node; struct bpf_cpu_map_entry *obj; unsigned int count; + local_lock_t bq_lock; }; /* Struct for every remote "destination" CPU in map */ @@ -221,7 +223,10 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } break; default: - bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); + bpf_warn_invalid_xdp_action(xdpf->dev_rx, rcpu->prog, act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(xdpf->dev_rx, rcpu->prog, act); fallthrough; case XDP_DROP: xdp_return_frame(xdpf); @@ -451,6 +456,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, for_each_possible_cpu(i) { bq = per_cpu_ptr(rcpu->bulkq, i); bq->obj = rcpu; + local_lock_init(&bq->bq_lock); } /* Alloc queue */ @@ -722,6 +728,8 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq) struct ptr_ring *q; int i; + lockdep_assert_held(&bq->bq_lock); + if (unlikely(!bq->count)) return; @@ -749,11 +757,15 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq) } /* Runs under RCU-read-side, plus in softirq under NAPI protection. - * Thus, safe percpu variable access. + * Thus, safe percpu variable access. PREEMPT_RT relies on + * local_lock_nested_bh() to serialise access to the per-CPU bq. 
*/ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { - struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + struct xdp_bulk_queue *bq; + + local_lock_nested_bh(&rcpu->bulkq->bq_lock); + bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) bq_flush_to_queue(bq); @@ -774,6 +786,8 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) list_add(&bq->flush_node, flush_list); } + + local_unlock_nested_bh(&rcpu->bulkq->bq_lock); } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -810,7 +824,9 @@ void __cpu_map_flush(struct list_head *flush_list) struct xdp_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + local_lock_nested_bh(&bq->obj->bulkq->bq_lock); bq_flush_to_queue(bq); + local_unlock_nested_bh(&bq->obj->bulkq->bq_lock); /* If already running, costs spin_lock_irqsave + smb_mb */ wake_up_process(bq->obj->kthread); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2625601de76e..cc0a43ebab6b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -45,6 +45,7 @@ * types of devmap; only the lookup and insertion is different. 
*/ #include <linux/bpf.h> +#include <linux/local_lock.h> #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> @@ -60,6 +61,7 @@ struct xdp_dev_bulk_queue { struct net_device *dev_rx; struct bpf_prog *xdp_prog; unsigned int count; + local_lock_t bq_lock; }; struct bpf_dtab_netdev { @@ -381,6 +383,8 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) int to_send = cnt; int i; + lockdep_assert_held(&bq->bq_lock); + if (unlikely(!cnt)) return; @@ -425,10 +429,12 @@ void __dev_flush(struct list_head *flush_list) struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + local_lock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); bq_xmit_all(bq, XDP_XMIT_FLUSH); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); + local_unlock_nested_bh(&bq->dev->xdp_bulkq->bq_lock); } } @@ -451,12 +457,16 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu * variable access, and map elements stick around. See comment above - * xdp_do_flush() in filter.c. + * xdp_do_flush() in filter.c. PREEMPT_RT relies on local_lock_nested_bh() + * to serialise access to the per-CPU bq. 
*/ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); + struct xdp_dev_bulk_queue *bq; + + local_lock_nested_bh(&dev->xdp_bulkq->bq_lock); + bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); @@ -477,6 +487,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, } bq->q[bq->count++] = xdpf; + + local_unlock_nested_bh(&dev->xdp_bulkq->bq_lock); } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, @@ -588,18 +600,22 @@ static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifin } /* Get ifindex of each upper device. 'indexes' must be able to hold at - * least MAX_NEST_DEV elements. - * Returns the number of ifindexes added. + * least 'max' elements. + * Returns the number of ifindexes added, or -EOVERFLOW if there are too + * many upper devices. 
*/ -static int get_upper_ifindexes(struct net_device *dev, int *indexes) +static int get_upper_ifindexes(struct net_device *dev, int *indexes, int max) { struct net_device *upper; struct list_head *iter; int n = 0; netdev_for_each_upper_dev_rcu(dev, upper, iter) { + if (n >= max) + return -EOVERFLOW; indexes[n++] = upper->ifindex; } + return n; } @@ -615,7 +631,11 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, int err; if (exclude_ingress) { - num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); + num_excluded = get_upper_ifindexes(dev_rx, excluded_devices, + ARRAY_SIZE(excluded_devices) - 1); + if (num_excluded < 0) + return num_excluded; + excluded_devices[num_excluded++] = dev_rx->ifindex; } @@ -645,7 +665,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, - lockdep_is_held(&dtab->index_lock)) { + rcu_read_lock_bh_held()) { if (!is_valid_dst(dst, xdpf)) continue; @@ -727,13 +747,16 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; - struct hlist_node *next; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { - num_excluded = get_upper_ifindexes(dev, excluded_devices); + num_excluded = get_upper_ifindexes(dev, excluded_devices, + ARRAY_SIZE(excluded_devices) - 1); + if (num_excluded < 0) + return num_excluded; + excluded_devices[num_excluded++] = dev->ifindex; } @@ -763,7 +786,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); - hlist_for_each_entry_safe(dst, next, head, index_hlist) { + hlist_for_each_entry_rcu(dst, head, index_hlist, rcu_read_lock_bh_held()) { if 
(is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; @@ -1115,8 +1138,13 @@ static int dev_map_notification(struct notifier_block *notifier, if (!netdev->xdp_bulkq) return NOTIFY_BAD; - for_each_possible_cpu(cpu) - per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; + for_each_possible_cpu(cpu) { + struct xdp_dev_bulk_queue *bq; + + bq = per_cpu_ptr(netdev->xdp_bulkq, cpu); + bq->dev = netdev; + local_lock_init(&bq->bq_lock); + } break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c new file mode 100644 index 000000000000..fba9e8c00878 --- /dev/null +++ b/kernel/bpf/fixups.c @@ -0,0 +1,2570 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/bpf_verifier.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> +#include <linux/bsearch.h> +#include <linux/sort.h> +#include <linux/perf_event.h> +#include <net/xdp.h> +#include "disasm.h" + +#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) + +static bool is_cmpxchg_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_CMPXCHG; +} + +/* Return the regno defined by the insn, or -1. */ +static int insn_def_regno(const struct bpf_insn *insn) +{ + switch (BPF_CLASS(insn->code)) { + case BPF_JMP: + case BPF_JMP32: + case BPF_ST: + return -1; + case BPF_STX: + if (BPF_MODE(insn->code) == BPF_ATOMIC || + BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { + if (insn->imm == BPF_CMPXCHG) + return BPF_REG_0; + else if (insn->imm == BPF_LOAD_ACQ) + return insn->dst_reg; + else if (insn->imm & BPF_FETCH) + return insn->src_reg; + } + return -1; + default: + return insn->dst_reg; + } +} + +/* Return TRUE if INSN has defined any 32-bit value explicitly. 
*/ +static bool insn_has_def32(struct bpf_insn *insn) +{ + int dst_reg = insn_def_regno(insn); + + if (dst_reg == -1) + return false; + + return !bpf_is_reg64(insn, dst_reg, NULL, DST_OP); +} + +static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) +{ + const struct bpf_kfunc_desc *d0 = a; + const struct bpf_kfunc_desc *d1 = b; + + if (d0->imm != d1->imm) + return d0->imm < d1->imm ? -1 : 1; + if (d0->offset != d1->offset) + return d0->offset < d1->offset ? -1 : 1; + return 0; +} + +const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn) +{ + const struct bpf_kfunc_desc desc = { + .imm = insn->imm, + .offset = insn->off, + }; + const struct bpf_kfunc_desc *res; + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + res = bsearch(&desc, tab->descs, tab->nr_descs, + sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off); + + return res ? &res->func_model : NULL; +} + +static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc) +{ + unsigned long call_imm; + + if (bpf_jit_supports_far_kfunc_call()) { + call_imm = desc->func_id; + } else { + call_imm = BPF_CALL_IMM(desc->addr); + /* Check whether the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel func_id %u is out of range\n", + desc->func_id); + return -EINVAL; + } + } + desc->imm = call_imm; + return 0; +} + +static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env) +{ + struct bpf_kfunc_desc_tab *tab; + int i, err; + + tab = env->prog->aux->kfunc_tab; + if (!tab) + return 0; + + for (i = 0; i < tab->nr_descs; i++) { + err = set_kfunc_desc_imm(env, &tab->descs[i]); + if (err) + return err; + } + + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_imm_off, NULL); + return 0; +} + +static int add_kfunc_in_insns(struct bpf_verifier_env *env, + struct bpf_insn *insn, int cnt) +{ + int i, ret; + + for (i 
= 0; i < cnt; i++, insn++) { + if (bpf_pseudo_kfunc_call(insn)) { + ret = bpf_add_kfunc_call(env, insn->imm, insn->off); + if (ret < 0) + return ret; + } + } + return 0; +} + +#ifndef CONFIG_BPF_JIT_ALWAYS_ON +static int get_callee_stack_depth(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int idx) +{ + int start = idx + insn->imm + 1, subprog; + + subprog = bpf_find_subprog(env, start); + if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start)) + return -EFAULT; + return env->subprog_info[subprog].stack_depth; +} +#endif + +/* single env->prog->insni[off] instruction was replaced with the range + * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying + * [0, off) and [off, end) to new locations, so the patched range stays zero + */ +static void adjust_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_prog *new_prog, u32 off, u32 cnt) +{ + struct bpf_insn_aux_data *data = env->insn_aux_data; + struct bpf_insn *insn = new_prog->insnsi; + u32 old_seen = data[off].seen; + u32 prog_len; + int i; + + /* aux info at OFF always needs adjustment, no matter fast path + * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the + * original insn at old prog. + */ + data[off].zext_dst = insn_has_def32(insn + off + cnt - 1); + + if (cnt == 1) + return; + prog_len = new_prog->len; + + memmove(data + off + cnt - 1, data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1)); + for (i = off; i < off + cnt - 1; i++) { + /* Expand insni[off]'s seen count to the patched range. */ + data[i].seen = old_seen; + data[i].zext_dst = insn_has_def32(insn + i); + } + + /* + * The indirect_target flag of the original instruction was moved to the last of the + * new instructions by the above memmove and memset, but the indirect jump target is + * actually the first instruction, so move it back. 
This also matches with the behavior + * of bpf_insn_array_adjust(), which preserves xlated_off to point to the first new + * instruction. + */ + if (data[off + cnt - 1].indirect_target) { + data[off].indirect_target = 1; + data[off + cnt - 1].indirect_target = 0; + } +} + +static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + /* NOTE: fake 'exit' subprog should be updated as well. */ + for (i = 0; i <= env->subprog_cnt; i++) { + if (env->subprog_info[i].start <= off) + continue; + env->subprog_info[i].start += len - 1; + } +} + +static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust(env->insn_array_maps[i], off, len); +} + +static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + for (i = 0; i < env->insn_array_map_cnt; i++) + bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len); +} + +static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) +{ + struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; + int i, sz = prog->aux->size_poke_tab; + struct bpf_jit_poke_descriptor *desc; + + for (i = 0; i < sz; i++) { + desc = &tab[i]; + if (desc->insn_idx <= off) + continue; + desc->insn_idx += len - 1; + } +} + +struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len) +{ + struct bpf_prog *new_prog; + struct bpf_insn_aux_data *new_data = NULL; + + if (len > 1) { + new_data = vrealloc(env->insn_aux_data, + array_size(env->prog->len + len - 1, + sizeof(struct bpf_insn_aux_data)), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!new_data) + return NULL; + + env->insn_aux_data = new_data; + } + + new_prog = bpf_patch_insn_single(env->prog, off, patch, len); + if (IS_ERR(new_prog)) { + if (PTR_ERR(new_prog) == -ERANGE) + verbose(env, + 
"insn %d cannot be patched due to 16-bit range\n", + env->insn_aux_data[off].orig_idx); + return NULL; + } + adjust_insn_aux_data(env, new_prog, off, len); + adjust_subprog_starts(env, off, len); + adjust_insn_arrays(env, off, len); + adjust_poke_descs(new_prog, off, len); + return new_prog; +} + +/* + * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the + * jump offset by 'delta'. + */ +static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 insn_cnt = prog->len, i; + s32 imm; + s16 off; + + for (i = 0; i < insn_cnt; i++, insn++) { + u8 code = insn->code; + + if (tgt_idx <= i && i < tgt_idx + delta) + continue; + + if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || + BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) + continue; + + if (insn->code == (BPF_JMP32 | BPF_JA)) { + if (i + 1 + insn->imm != tgt_idx) + continue; + if (check_add_overflow(insn->imm, delta, &imm)) + return -ERANGE; + insn->imm = imm; + } else { + if (i + 1 + insn->off != tgt_idx) + continue; + if (check_add_overflow(insn->off, delta, &off)) + return -ERANGE; + insn->off = off; + } + } + return 0; +} + +static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, + u32 off, u32 cnt) +{ + int i, j; + + /* find first prog starting at or after off (first to remove) */ + for (i = 0; i < env->subprog_cnt; i++) + if (env->subprog_info[i].start >= off) + break; + /* find first prog starting at or after off + cnt (first to stay) */ + for (j = i; j < env->subprog_cnt; j++) + if (env->subprog_info[j].start >= off + cnt) + break; + /* if j doesn't start exactly at off + cnt, we are just removing + * the front of previous prog + */ + if (env->subprog_info[j].start != off + cnt) + j--; + + if (j > i) { + struct bpf_prog_aux *aux = env->prog->aux; + int move; + + /* move fake 'exit' subprog as well */ + move = env->subprog_cnt + 1 - j; + + memmove(env->subprog_info + i, + 
env->subprog_info + j, + sizeof(*env->subprog_info) * move); + env->subprog_cnt -= j - i; + + /* remove func_info */ + if (aux->func_info) { + move = aux->func_info_cnt - j; + + memmove(aux->func_info + i, + aux->func_info + j, + sizeof(*aux->func_info) * move); + aux->func_info_cnt -= j - i; + /* func_info->insn_off is set after all code rewrites, + * in adjust_btf_func() - no need to adjust + */ + } + } else { + /* convert i from "first prog to remove" to "first to adjust" */ + if (env->subprog_info[i].start == off) + i++; + } + + /* update fake 'exit' subprog as well */ + for (; i <= env->subprog_cnt; i++) + env->subprog_info[i].start -= cnt; + + return 0; +} + +static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, + u32 cnt) +{ + struct bpf_prog *prog = env->prog; + u32 i, l_off, l_cnt, nr_linfo; + struct bpf_line_info *linfo; + + nr_linfo = prog->aux->nr_linfo; + if (!nr_linfo) + return 0; + + linfo = prog->aux->linfo; + + /* find first line info to remove, count lines to be removed */ + for (i = 0; i < nr_linfo; i++) + if (linfo[i].insn_off >= off) + break; + + l_off = i; + l_cnt = 0; + for (; i < nr_linfo; i++) + if (linfo[i].insn_off < off + cnt) + l_cnt++; + else + break; + + /* First live insn doesn't match first live linfo, it needs to "inherit" + * last removed linfo. prog is already modified, so prog->len == off + * means no live instructions after (tail of the program was removed). + */ + if (prog->len != off && l_cnt && + (i == nr_linfo || linfo[i].insn_off != off + cnt)) { + l_cnt--; + linfo[--i].insn_off = off + cnt; + } + + /* remove the line info which refer to the removed instructions */ + if (l_cnt) { + memmove(linfo + l_off, linfo + i, + sizeof(*linfo) * (nr_linfo - i)); + + prog->aux->nr_linfo -= l_cnt; + nr_linfo = prog->aux->nr_linfo; + } + + /* pull all linfo[i].insn_off >= off + cnt in by cnt */ + for (i = l_off; i < nr_linfo; i++) + linfo[i].insn_off -= cnt; + + /* fix up all subprogs (incl. 
'exit') which start >= off */ + for (i = 0; i <= env->subprog_cnt; i++) + if (env->subprog_info[i].linfo_idx > l_off) { + /* program may have started in the removed region but + * may not be fully removed + */ + if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) + env->subprog_info[i].linfo_idx -= l_cnt; + else + env->subprog_info[i].linfo_idx = l_off; + } + + return 0; +} + +/* + * Clean up dynamically allocated fields of aux data for instructions [start, ...] + */ +void bpf_clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + int end = start + len; + int i; + + for (i = start; i < end; i++) { + if (aux_data[i].jt) { + kvfree(aux_data[i].jt); + aux_data[i].jt = NULL; + } + + if (bpf_is_ldimm64(&insns[i])) + i++; + } +} + +static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + unsigned int orig_prog_len = env->prog->len; + int err; + + if (bpf_prog_is_offloaded(env->prog->aux)) + bpf_prog_offload_remove_insns(env, off, cnt); + + /* Should be called before bpf_remove_insns, as it uses prog->insnsi */ + bpf_clear_insn_aux_data(env, off, cnt); + + err = bpf_remove_insns(env->prog, off, cnt); + if (err) + return err; + + err = adjust_subprog_starts_after_remove(env, off, cnt); + if (err) + return err; + + err = bpf_adj_linfo_after_remove(env, off, cnt); + if (err) + return err; + + adjust_insn_arrays_after_remove(env, off, cnt); + + memmove(aux_data + off, aux_data + off + cnt, + sizeof(*aux_data) * (orig_prog_len - off - cnt)); + + return 0; +} + +static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0); +static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0); + +bool bpf_insn_is_cond_jump(u8 code) +{ + u8 op; + + op = BPF_OP(code); + if (BPF_CLASS(code) == BPF_JMP32) + return op != BPF_JA; + + if (BPF_CLASS(code) 
!= BPF_JMP) + return false; + + return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; +} + +void bpf_opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_insn_is_cond_jump(insn->code)) + continue; + + if (!aux_data[i + 1].seen) + ja.off = insn->off; + else if (!aux_data[i + 1 + insn->off].seen) + ja.off = 0; + else + continue; + + if (bpf_prog_is_offloaded(env->prog->aux)) + bpf_prog_offload_replace_insn(env, i, &ja); + + memcpy(insn, &ja, sizeof(ja)); + } +} + +int bpf_opt_remove_dead_code(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + int insn_cnt = env->prog->len; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + int j; + + j = 0; + while (i + j < insn_cnt && !aux_data[i + j].seen) + j++; + if (!j) + continue; + + err = verifier_remove_insns(env, i, j); + if (err) + return err; + insn_cnt = env->prog->len; + } + + return 0; +} + +int bpf_opt_remove_nops(struct bpf_verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + bool is_may_goto_0, is_ja; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0)); + is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP)); + + if (!is_may_goto_0 && !is_ja) + continue; + + err = verifier_remove_insns(env, i, 1); + if (err) + return err; + insn_cnt--; + /* Go back one insn to catch may_goto +1; may_goto +0 sequence */ + i -= (is_may_goto_0 && i > 0) ? 
2 : 1; + } + + return 0; +} + +int bpf_opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, + const union bpf_attr *attr) +{ + struct bpf_insn *patch; + /* use env->insn_buf as two independent buffers */ + struct bpf_insn *zext_patch = env->insn_buf; + struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2]; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + int i, patch_len, delta = 0, len = env->prog->len; + struct bpf_insn *insns = env->prog->insnsi; + struct bpf_prog *new_prog; + bool rnd_hi32; + + rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; + zext_patch[1] = BPF_ZEXT_REG(0); + rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); + rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); + for (i = 0; i < len; i++) { + int adj_idx = i + delta; + struct bpf_insn insn; + int load_reg; + + insn = insns[adj_idx]; + load_reg = insn_def_regno(&insn); + if (!aux[adj_idx].zext_dst) { + u8 code, class; + u32 imm_rnd; + + if (!rnd_hi32) + continue; + + code = insn.code; + class = BPF_CLASS(code); + if (load_reg == -1) + continue; + + /* NOTE: arg "reg" (the fourth one) is only used for + * BPF_STX + SRC_OP, so it is safe to pass NULL + * here. + */ + if (bpf_is_reg64(&insn, load_reg, NULL, DST_OP)) { + if (class == BPF_LD && + BPF_MODE(code) == BPF_IMM) + i++; + continue; + } + + /* ctx load could be transformed into wider load. */ + if (class == BPF_LDX && + aux[adj_idx].ptr_type == PTR_TO_CTX) + continue; + + imm_rnd = get_random_u32(); + rnd_hi32_patch[0] = insn; + rnd_hi32_patch[1].imm = imm_rnd; + rnd_hi32_patch[3].dst_reg = load_reg; + patch = rnd_hi32_patch; + patch_len = 4; + goto apply_patch_buffer; + } + + /* Add in an zero-extend instruction if a) the JIT has requested + * it or b) it's a CMPXCHG. + * + * The latter is because: BPF_CMPXCHG always loads a value into + * R0, therefore always zero-extends. 
However some archs' + * equivalent instruction only does this load when the + * comparison is successful. This detail of CMPXCHG is + * orthogonal to the general zero-extension behaviour of the + * CPU, so it's treated independently of bpf_jit_needs_zext. + */ + if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn)) + continue; + + /* Zero-extension is done by the caller. */ + if (bpf_pseudo_kfunc_call(&insn)) + continue; + + if (verifier_bug_if(load_reg == -1, env, + "zext_dst is set, but no reg is defined")) + return -EFAULT; + + zext_patch[0] = insn; + zext_patch[1].dst_reg = load_reg; + zext_patch[1].src_reg = load_reg; + patch = zext_patch; + patch_len = 2; +apply_patch_buffer: + new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + insns = new_prog->insnsi; + aux = env->insn_aux_data; + delta += patch_len - 1; + } + + return 0; +} + +/* convert load instructions that access fields of a context type into a + * sequence of instructions that access fields of the underlying structure: + * struct __sk_buff -> struct sk_buff + * struct bpf_sock_ops -> struct sock + */ +int bpf_convert_ctx_accesses(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprogs = env->subprog_info; + const struct bpf_verifier_ops *ops = env->ops; + int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0; + const int insn_cnt = env->prog->len; + struct bpf_insn *epilogue_buf = env->epilogue_buf; + struct bpf_insn *insn_buf = env->insn_buf; + struct bpf_insn *insn; + u32 target_size, size_default, off; + struct bpf_prog *new_prog; + enum bpf_access_type type; + bool is_narrower_load; + int epilogue_idx = 0; + + if (ops->gen_epilogue) { + epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, + -(subprogs[0].stack_depth + 8)); + if (epilogue_cnt >= INSN_BUF_SIZE) { + verifier_bug(env, "epilogue is too long"); + return -EFAULT; + } else if (epilogue_cnt) { + /* Save the ARG_PTR_TO_CTX for the 
epilogue to use */ + cnt = 0; + subprogs[0].stack_depth += 8; + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, + -subprogs[0].stack_depth); + insn_buf[cnt++] = env->prog->insnsi[0]; + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + delta += cnt - 1; + + ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1); + if (ret < 0) + return ret; + } + } + + if (ops->gen_prologue || env->seen_direct_write) { + if (!ops->gen_prologue) { + verifier_bug(env, "gen_prologue is null"); + return -EFAULT; + } + cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, + env->prog); + if (cnt >= INSN_BUF_SIZE) { + verifier_bug(env, "prologue is too long"); + return -EFAULT; + } else if (cnt) { + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + env->prog = new_prog; + delta += cnt - 1; + + ret = add_kfunc_in_insns(env, insn_buf, cnt - 1); + if (ret < 0) + return ret; + } + } + + if (delta) + WARN_ON(adjust_jmp_off(env->prog, 0, delta)); + + if (bpf_prog_is_offloaded(env->prog->aux)) + return 0; + + insn = env->prog->insnsi + delta; + + for (i = 0; i < insn_cnt; i++, insn++) { + bpf_convert_ctx_access_t convert_ctx_access; + u8 mode; + + if (env->insn_aux_data[i + delta].nospec) { + WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state); + struct bpf_insn *patch = insn_buf; + + *patch++ = BPF_ST_NOSPEC(); + *patch++ = *insn; + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + /* This can not be easily merged with the + * nospec_result-case, because an insn may require a + * nospec before and after itself. Therefore also do not + * 'continue' here but potentially apply further + * patching to insn. *insn should equal patch[1] now. 
+ */ + } + + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || + insn->code == (BPF_LDX | BPF_MEM | BPF_H) || + insn->code == (BPF_LDX | BPF_MEM | BPF_W) || + insn->code == (BPF_LDX | BPF_MEM | BPF_DW) || + insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) || + insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) || + insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) { + type = BPF_READ; + } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW) || + insn->code == (BPF_ST | BPF_MEM | BPF_B) || + insn->code == (BPF_ST | BPF_MEM | BPF_H) || + insn->code == (BPF_ST | BPF_MEM | BPF_W) || + insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { + type = BPF_WRITE; + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && + env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { + insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); + env->prog->aux->num_exentries++; + continue; + } else if (insn->code == (BPF_JMP | BPF_EXIT) && + epilogue_cnt && + i + delta < subprogs[1].start) { + /* Generate epilogue for the main prog */ + if (epilogue_idx) { + /* jump back to the earlier generated epilogue */ + insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1); + cnt = 1; + } else { + memcpy(insn_buf, epilogue_buf, + epilogue_cnt * sizeof(*epilogue_buf)); + cnt = epilogue_cnt; + /* epilogue_idx cannot be 0. It must have at + * least one ctx ptr saving insn before the + * epilogue. + */ + epilogue_idx = i + delta; + } + goto patch_insn_buf; + } else { + continue; + } + + if (type == BPF_WRITE && + env->insn_aux_data[i + delta].nospec_result) { + /* nospec_result is only used to mitigate Spectre v4 and + * to limit verification-time for Spectre v1. 
+ */ + struct bpf_insn *patch = insn_buf; + + *patch++ = *insn; + *patch++ = BPF_ST_NOSPEC(); + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + + switch ((int)env->insn_aux_data[i + delta].ptr_type) { + case PTR_TO_CTX: + if (!ops->convert_ctx_access) + continue; + convert_ctx_access = ops->convert_ctx_access; + break; + case PTR_TO_SOCKET: + case PTR_TO_SOCK_COMMON: + convert_ctx_access = bpf_sock_convert_ctx_access; + break; + case PTR_TO_TCP_SOCK: + convert_ctx_access = bpf_tcp_sock_convert_ctx_access; + break; + case PTR_TO_XDP_SOCK: + convert_ctx_access = bpf_xdp_sock_convert_ctx_access; + break; + case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | PTR_UNTRUSTED: + /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike + * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot + * be said once it is marked PTR_UNTRUSTED, hence we must handle + * any faults for loads into such types. BPF_WRITE is disallowed + * for this case. 
+ */ + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: + case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED: + if (type == BPF_READ) { + if (BPF_MODE(insn->code) == BPF_MEM) + insn->code = BPF_LDX | BPF_PROBE_MEM | + BPF_SIZE((insn)->code); + else + insn->code = BPF_LDX | BPF_PROBE_MEMSX | + BPF_SIZE((insn)->code); + env->prog->aux->num_exentries++; + } + continue; + case PTR_TO_ARENA: + if (BPF_MODE(insn->code) == BPF_MEMSX) { + if (!bpf_jit_supports_insn(insn, true)) { + verbose(env, "sign extending loads from arena are not supported yet\n"); + return -EOPNOTSUPP; + } + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code); + } else { + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); + } + env->prog->aux->num_exentries++; + continue; + default: + continue; + } + + ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; + size = BPF_LDST_BYTES(insn); + mode = BPF_MODE(insn->code); + + /* If the read access is a narrower load of the field, + * convert to a 4/8-byte load, to minimum program type specific + * convert_ctx_access changes. If conversion is successful, + * we will apply proper mask to the result. 
+ */ + is_narrower_load = size < ctx_field_size; + size_default = bpf_ctx_off_adjust_machine(ctx_field_size); + off = insn->off; + if (is_narrower_load) { + u8 size_code; + + if (type == BPF_WRITE) { + verifier_bug(env, "narrow ctx access misconfigured"); + return -EFAULT; + } + + size_code = BPF_H; + if (ctx_field_size == 4) + size_code = BPF_W; + else if (ctx_field_size == 8) + size_code = BPF_DW; + + insn->off = off & ~(size_default - 1); + insn->code = BPF_LDX | BPF_MEM | size_code; + } + + target_size = 0; + cnt = convert_ctx_access(type, insn, insn_buf, env->prog, + &target_size); + if (cnt == 0 || cnt >= INSN_BUF_SIZE || + (ctx_field_size && !target_size)) { + verifier_bug(env, "error during ctx access conversion (%d)", cnt); + return -EFAULT; + } + + if (is_narrower_load && size < target_size) { + u8 shift = bpf_ctx_narrow_access_offset( + off, size, size_default) * 8; + if (shift && cnt + 1 >= INSN_BUF_SIZE) { + verifier_bug(env, "narrow ctx load misconfigured"); + return -EFAULT; + } + if (ctx_field_size <= 4) { + if (shift) + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, + insn->dst_reg, + shift); + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, + (1 << size * 8) - 1); + } else { + if (shift) + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, + insn->dst_reg, + shift); + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, + (1ULL << size * 8) - 1); + } + } + if (mode == BPF_MEMSX) + insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, + insn->dst_reg, insn->dst_reg, + size * 8, 0); + +patch_insn_buf: + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + } + + return 0; +} + +static u32 *bpf_dup_subprog_starts(struct bpf_verifier_env *env) +{ + u32 *starts = NULL; + + starts = kvmalloc_objs(u32, env->subprog_cnt, GFP_KERNEL_ACCOUNT); + if (starts) { + 
		/* Save a copy of each subprog's start offset so it can be
		 * restored if constant blinding has to be rolled back.
		 */
		for (int i = 0; i < env->subprog_cnt; i++)
			starts[i] = env->subprog_info[i].start;
	}
	return starts;
}

/* Restore subprog start offsets saved by bpf_dup_subprog_starts(). */
static void bpf_restore_subprog_starts(struct bpf_verifier_env *env, u32 *orig_starts)
{
	for (int i = 0; i < env->subprog_cnt; i++)
		env->subprog_info[i].start = orig_starts[i];
	/* restore the start of fake 'exit' subprog as well */
	env->subprog_info[env->subprog_cnt].start = env->prog->len;
}

/* Snapshot the per-insn aux data array (one element per prog insn).
 * Returns a vmalloc'ed copy, or NULL on allocation failure. The caller
 * owns the copy and hands it back via bpf_restore_insn_aux_data() (or
 * frees it with vfree() if no restore is needed).
 */
struct bpf_insn_aux_data *bpf_dup_insn_aux_data(struct bpf_verifier_env *env)
{
	size_t size;
	void *new_aux;

	/* array_size() saturates on multiplication overflow */
	size = array_size(sizeof(struct bpf_insn_aux_data), env->prog->len);
	new_aux = __vmalloc(size, GFP_KERNEL_ACCOUNT);
	if (new_aux)
		memcpy(new_aux, env->insn_aux_data, size);
	return new_aux;
}

/* Throw away the current (possibly grown) insn_aux_data array and put the
 * snapshot taken by bpf_dup_insn_aux_data() back in place. Ownership of
 * @orig_insn_aux transfers to @env.
 */
void bpf_restore_insn_aux_data(struct bpf_verifier_env *env,
			       struct bpf_insn_aux_data *orig_insn_aux)
{
	/* the expanded elements are zero-filled, so no special handling is required */
	vfree(env->insn_aux_data);
	env->insn_aux_data = orig_insn_aux;
}

/* Split a program with bpf-to-bpf calls into one bpf_prog per subprog and
 * JIT each of them. On success the main prog's bpf_func/extable point at
 * subprog 0's image and prog->aux->func[] holds all subprog images.
 * Returns 0 on success, -ENOTSUPP if the arch JIT cannot handle it (callers
 * may fall back to the interpreter), -EFAULT on verifier-internal bugs.
 */
static int jit_subprogs(struct bpf_verifier_env *env)
{
	struct bpf_prog *prog = env->prog, **func, *tmp;
	int i, j, subprog_start, subprog_end = 0, len, subprog;
	struct bpf_map *map_ptr;
	struct bpf_insn *insn;
	void *old_bpf_func;
	int err, num_exentries;

	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
		if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
			continue;

		/* Upon error here we cannot fall back to interpreter but
		 * need a hard reject of the program. Thus -EFAULT is
		 * propagated in any case.
		 */
		subprog = bpf_find_subprog(env, i + insn->imm + 1);
		if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d",
				    i + insn->imm + 1))
			return -EFAULT;
		/* temporarily remember subprog id inside insn instead of
		 * aux_data, since next loop will split up all insns into funcs
		 */
		insn->off = subprog;
		/* remember original imm in case JIT fails and fallback
		 * to interpreter will be needed
		 */
		env->insn_aux_data[i].call_imm = insn->imm;
		/* point imm to __bpf_call_base+1 from JITs point of view */
		insn->imm = 1;
		if (bpf_pseudo_func(insn)) {
#if defined(MODULES_VADDR)
			u64 addr = MODULES_VADDR;
#else
			u64 addr = VMALLOC_START;
#endif
			/* jit (e.g. x86_64) may emit fewer instructions
			 * if it learns a u32 imm is the same as a u64 imm.
			 * Set close enough to possible prog address.
			 */
			insn[0].imm = (u32)addr;
			insn[1].imm = addr >> 32;
		}
	}

	err = bpf_prog_alloc_jited_linfo(prog);
	if (err)
		goto out_undo_insn;

	err = -ENOMEM;
	/* zeroed array of bpf_prog pointers, one slot per subprog */
	func = kzalloc_objs(prog, env->subprog_cnt);
	if (!func)
		goto out_undo_insn;

	for (i = 0; i < env->subprog_cnt; i++) {
		subprog_start = subprog_end;
		subprog_end = env->subprog_info[i + 1].start;

		len = subprog_end - subprog_start;
		/* bpf_prog_run() doesn't call subprogs directly,
		 * hence main prog stats include the runtime of subprogs.
		 * subprogs don't have IDs and are not reachable via prog_get_next_id
		 * func[i]->stats will never be accessed and stays NULL
		 */
		func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
		if (!func[i])
			goto out_free;
		memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
		       len * sizeof(struct bpf_insn));
		func[i]->type = prog->type;
		func[i]->len = len;
		if (bpf_prog_calc_tag(func[i]))
			goto out_free;
		func[i]->is_func = 1;
		func[i]->sleepable = prog->sleepable;
		func[i]->blinded = prog->blinded;
		func[i]->aux->func_idx = i;
		/* Below members will be freed only at prog->aux */
		func[i]->aux->btf = prog->aux->btf;
		func[i]->aux->subprog_start = subprog_start;
		func[i]->aux->func_info = prog->aux->func_info;
		func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
		func[i]->aux->poke_tab = prog->aux->poke_tab;
		func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
		func[i]->aux->main_prog_aux = prog->aux;

		/* Re-point poke descriptors that land inside this subprog at
		 * its aux, so image patching targets the right prog.
		 */
		for (j = 0; j < prog->aux->size_poke_tab; j++) {
			struct bpf_jit_poke_descriptor *poke;

			poke = &prog->aux->poke_tab[j];
			if (poke->insn_idx < subprog_end &&
			    poke->insn_idx >= subprog_start)
				poke->aux = func[i]->aux;
		}

		func[i]->aux->name[0] = 'F';
		func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
		if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE)
			func[i]->aux->jits_use_priv_stack = true;

		func[i]->jit_requested = 1;
		func[i]->blinding_requested = prog->blinding_requested;
		func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
		func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
		func[i]->aux->linfo = prog->aux->linfo;
		func[i]->aux->nr_linfo = prog->aux->nr_linfo;
		func[i]->aux->jited_linfo = prog->aux->jited_linfo;
		func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
		func[i]->aux->arena = prog->aux->arena;
		func[i]->aux->used_maps = env->used_maps;
		func[i]->aux->used_map_cnt = env->used_map_cnt;
		/* count insns that may fault so the JIT can size its
		 * exception table for this subprog
		 */
		num_exentries = 0;
		insn = func[i]->insnsi;
		for (j = 0; j < func[i]->len; j++, insn++) {
			if (BPF_CLASS(insn->code) == BPF_LDX &&
			    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
			     BPF_MODE(insn->code) == BPF_PROBE_MEM32 ||
			     BPF_MODE(insn->code) == BPF_PROBE_MEM32SX ||
			     BPF_MODE(insn->code) == BPF_PROBE_MEMSX))
				num_exentries++;
			if ((BPF_CLASS(insn->code) == BPF_STX ||
			     BPF_CLASS(insn->code) == BPF_ST) &&
			    BPF_MODE(insn->code) == BPF_PROBE_MEM32)
				num_exentries++;
			if (BPF_CLASS(insn->code) == BPF_STX &&
			    BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
				num_exentries++;
		}
		func[i]->aux->num_exentries = num_exentries;
		func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
		func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
		func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
		func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
		func[i]->aux->token = prog->aux->token;
		if (!i)
			func[i]->aux->exception_boundary = env->seen_exception;
		func[i] = bpf_int_jit_compile(env, func[i]);
		if (!func[i]->jited) {
			err = -ENOTSUPP;
			goto out_free;
		}
		cond_resched();
	}

	/* at this point all bpf functions were successfully JITed
	 * now populate all bpf_calls with correct addresses and
	 * run last pass of JIT
	 */
	for (i = 0; i < env->subprog_cnt; i++) {
		insn = func[i]->insnsi;
		for (j = 0; j < func[i]->len; j++, insn++) {
			if (bpf_pseudo_func(insn)) {
				subprog = insn->off;
				insn[0].imm = (u32)(long)func[subprog]->bpf_func;
				insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
				continue;
			}
			if (!bpf_pseudo_call(insn))
				continue;
			subprog = insn->off;
			insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func);
		}

		/* we use the aux data to keep a list of the start addresses
		 * of the JITed images for each function in the program
		 *
		 * for some architectures, such as powerpc64, the imm field
		 * might not be large enough to hold the offset of the start
		 * address of the callee's JITed image from __bpf_call_base
		 *
		 * in such cases, we can lookup the start address of a callee
		 * by using its subprog id, available from the off field of
		 * the call instruction, as an index for this list
		 */
		func[i]->aux->func = func;
		func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
		func[i]->aux->real_func_cnt = env->subprog_cnt;
	}
	/* second JIT pass: addresses of all callees are now known; the arch
	 * JIT must produce the same image at the same address
	 */
	for (i = 0; i < env->subprog_cnt; i++) {
		old_bpf_func = func[i]->bpf_func;
		tmp = bpf_int_jit_compile(env, func[i]);
		if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
			verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
			err = -ENOTSUPP;
			goto out_free;
		}
		cond_resched();
	}

	/*
	 * Cleanup func[i]->aux fields which aren't required
	 * or can become invalid in future
	 */
	for (i = 0; i < env->subprog_cnt; i++) {
		func[i]->aux->used_maps = NULL;
		func[i]->aux->used_map_cnt = 0;
	}

	/* finally lock prog and jit images for all functions and
	 * populate kallsyms. Begin at the first subprogram, since
	 * bpf_prog_load will add the kallsyms for the main program.
	 */
	for (i = 1; i < env->subprog_cnt; i++) {
		err = bpf_prog_lock_ro(func[i]);
		if (err)
			goto out_free;
	}

	for (i = 1; i < env->subprog_cnt; i++)
		bpf_prog_kallsyms_add(func[i]);

	/* Last step: make now unused interpreter insns from main
	 * prog consistent for later dump requests, so they can
	 * later look the same as if they were interpreted only.
	 */
	for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
		if (bpf_pseudo_func(insn)) {
			insn[0].imm = env->insn_aux_data[i].call_imm;
			insn[1].imm = insn->off;
			insn->off = 0;
			continue;
		}
		if (!bpf_pseudo_call(insn))
			continue;
		insn->off = env->insn_aux_data[i].call_imm;
		subprog = bpf_find_subprog(env, i + insn->off + 1);
		insn->imm = subprog;
	}

	prog->jited = 1;
	prog->bpf_func = func[0]->bpf_func;
	prog->jited_len = func[0]->jited_len;
	prog->aux->extable = func[0]->aux->extable;
	prog->aux->num_exentries = func[0]->aux->num_exentries;
	prog->aux->func = func;
	prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt;
	prog->aux->real_func_cnt = env->subprog_cnt;
	prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func;
	prog->aux->exception_boundary = func[0]->aux->exception_boundary;
	bpf_prog_jit_attempt_done(prog);
	return 0;
out_free:
	/* We failed JIT'ing, so at this point we need to unregister poke
	 * descriptors from subprogs, so that kernel is not attempting to
	 * patch it anymore as we're freeing the subprog JIT memory.
	 */
	for (i = 0; i < prog->aux->size_poke_tab; i++) {
		map_ptr = prog->aux->poke_tab[i].tail_call.map;
		map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
	}
	/* At this point we're guaranteed that poke descriptors are not
	 * live anymore. We can just unlink its descriptor table as it's
	 * released with the main prog.
	 */
	for (i = 0; i < env->subprog_cnt; i++) {
		if (!func[i])
			continue;
		func[i]->aux->poke_tab = NULL;
		bpf_jit_free(func[i]);
	}
	kfree(func);
out_undo_insn:
	bpf_prog_jit_attempt_done(prog);
	return err;
}

/* Entry point for subprog JITing. If constant blinding is needed, snapshot
 * insn_aux_data and the subprog start offsets first, since blinding patches
 * instructions and shifts subprog boundaries; on JIT failure (other than
 * -EFAULT) everything is rolled back so the interpreter can run the
 * original, unblinded program.
 * Returns 0 on success or if there is nothing to do (single subprog);
 * negative errno otherwise.
 */
int bpf_jit_subprogs(struct bpf_verifier_env *env)
{
	int err, i;
	bool blinded = false;
	struct bpf_insn *insn;
	struct bpf_prog *prog, *orig_prog;
	struct bpf_insn_aux_data *orig_insn_aux;
	u32 *orig_subprog_starts;

	if (env->subprog_cnt <= 1)
		return 0;

	prog = orig_prog = env->prog;
	if (bpf_prog_need_blind(prog)) {
		orig_insn_aux = bpf_dup_insn_aux_data(env);
		if (!orig_insn_aux) {
			err = -ENOMEM;
			goto out_cleanup;
		}
		orig_subprog_starts = bpf_dup_subprog_starts(env);
		if (!orig_subprog_starts) {
			vfree(orig_insn_aux);
			err = -ENOMEM;
			goto out_cleanup;
		}
		prog = bpf_jit_blind_constants(env, prog);
		if (IS_ERR(prog)) {
			err = -ENOMEM;
			prog = orig_prog;
			goto out_restore;
		}
		blinded = true;
	}

	err = jit_subprogs(env);
	if (err)
		goto out_jit_err;

	if (blinded) {
		/* blinded clone JITed fine; drop the other prog and the
		 * now-unneeded snapshots
		 */
		bpf_jit_prog_release_other(prog, orig_prog);
		kvfree(orig_subprog_starts);
		vfree(orig_insn_aux);
	}

	return 0;

out_jit_err:
	if (blinded) {
		bpf_jit_prog_release_other(orig_prog, prog);
		/* roll back to the clean original prog */
		prog = env->prog = orig_prog;
		goto out_restore;
	} else {
		if (err != -EFAULT) {
			/*
			 * We will fall back to interpreter mode when err is not -EFAULT, before
			 * that, insn->off and insn->imm should be restored to their original
			 * values since they were modified by jit_subprogs.
			 */
			for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
				if (!bpf_pseudo_call(insn))
					continue;
				insn->off = 0;
				insn->imm = env->insn_aux_data[i].call_imm;
			}
		}
		goto out_cleanup;
	}

out_restore:
	/* only reachable from blinded paths, so both snapshots exist */
	bpf_restore_subprog_starts(env, orig_subprog_starts);
	bpf_restore_insn_aux_data(env, orig_insn_aux);
	kvfree(orig_subprog_starts);
out_cleanup:
	/* cleanup main prog to be interpreted */
	prog->jit_requested = 0;
	prog->blinding_requested = 0;
	return err;
}

/* Try to JIT subprogs; when that is impossible, either reject the program
 * (kfunc calls, tail calls with bpf2bpf, callbacks are unsupported by the
 * interpreter) or patch pseudo-calls with callee stack depth so the
 * interpreter can run them. Returns 0 on success, negative errno otherwise.
 */
int bpf_fixup_call_args(struct bpf_verifier_env *env)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
	struct bpf_prog *prog = env->prog;
	struct bpf_insn *insn = prog->insnsi;
	bool has_kfunc_call = bpf_prog_has_kfunc_call(prog);
	int i, depth;
#endif
	int err = 0;

	if (env->prog->jit_requested &&
	    !bpf_prog_is_offloaded(env->prog->aux)) {
		err = bpf_jit_subprogs(env);
		if (err == 0)
			return 0;
		if (err == -EFAULT)
			return err;
	}
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
	if (has_kfunc_call) {
		verbose(env, "calling kernel functions are not allowed in non-JITed programs\n");
		return -EINVAL;
	}
	if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) {
		/* When JIT fails the progs with bpf2bpf calls and tail_calls
		 * have to be rejected, since interpreter doesn't support them yet.
		 */
		verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
		return -EINVAL;
	}
	for (i = 0; i < prog->len; i++, insn++) {
		if (bpf_pseudo_func(insn)) {
			/* When JIT fails the progs with callback calls
			 * have to be rejected, since interpreter doesn't support them yet.
			 */
			verbose(env, "callbacks are not allowed in non-JITed programs\n");
			return -EINVAL;
		}

		if (!bpf_pseudo_call(insn))
			continue;
		depth = get_callee_stack_depth(env, insn, i);
		if (depth < 0)
			return depth;
		/* encode callee stack depth into the call insn so the
		 * interpreter can adjust the frame on entry
		 */
		bpf_patch_call_args(insn, depth);
	}
	err = 0;
#endif
	return err;
}


/* Append @len instructions as a new hidden subprog at the end of the program
 * and record it in subprog_info. Hidden subprogs are verifier-generated
 * (e.g. the default exception callback) and invisible to userspace.
 * The function requires that first instruction in 'patch' is insnsi[prog->len - 1].
 * Returns 0 on success, -EFAULT if a hidden subprog already exists,
 * -ENOMEM on allocation failure.
 */
static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len)
{
	struct bpf_subprog_info *info = env->subprog_info;
	int cnt = env->subprog_cnt;
	struct bpf_prog *prog;

	/* We only reserve one slot for hidden subprogs in subprog_info. */
	if (env->hidden_subprog_cnt) {
		verifier_bug(env, "only one hidden subprog supported");
		return -EFAULT;
	}
	/* We're not patching any existing instruction, just appending the new
	 * ones for the hidden subprog. Hence all of the adjustment operations
	 * in bpf_patch_insn_data are no-ops.
	 */
	prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len);
	if (!prog)
		return -ENOMEM;
	env->prog = prog;
	/* shift the fake 'exit' subprog boundary and slot the hidden
	 * subprog in just before it
	 */
	info[cnt + 1].start = info[cnt].start;
	info[cnt].start = prog->len - len + 1;
	env->subprog_cnt++;
	env->hidden_subprog_cnt++;
	return 0;
}

/* Do various post-verification rewrites in a single program pass.
 * These rewrites simplify JIT and interpreter implementations.
+ */ +int bpf_do_misc_fixups(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + enum bpf_attach_type eatype = prog->expected_attach_type; + enum bpf_prog_type prog_type = resolve_prog_type(prog); + struct bpf_insn *insn = prog->insnsi; + const struct bpf_func_proto *fn; + const int insn_cnt = prog->len; + const struct bpf_map_ops *ops; + struct bpf_insn_aux_data *aux; + struct bpf_insn *insn_buf = env->insn_buf; + struct bpf_prog *new_prog; + struct bpf_map *map_ptr; + int i, ret, cnt, delta = 0, cur_subprog = 0; + struct bpf_subprog_info *subprogs = env->subprog_info; + u16 stack_depth = subprogs[cur_subprog].stack_depth; + u16 stack_depth_extra = 0; + + if (env->seen_exception && !env->exception_callback_subprog) { + struct bpf_insn *patch = insn_buf; + + *patch++ = env->prog->insnsi[insn_cnt - 1]; + *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); + *patch++ = BPF_EXIT_INSN(); + ret = add_hidden_subprog(env, insn_buf, patch - insn_buf); + if (ret < 0) + return ret; + prog = env->prog; + insn = prog->insnsi; + + env->exception_callback_subprog = env->subprog_cnt - 1; + /* Don't update insn_cnt, as add_hidden_subprog always appends insns */ + bpf_mark_subprog_exc_cb(env, env->exception_callback_subprog); + } + + for (i = 0; i < insn_cnt;) { + if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) { + if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) || + (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { + /* convert to 32-bit mov that clears upper 32-bit */ + insn->code = BPF_ALU | BPF_MOV | BPF_X; + /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */ + insn->off = 0; + insn->imm = 0; + } /* cast from as(0) to as(1) should be handled by JIT */ + goto next_insn; + } + + if (env->insn_aux_data[i + delta].needs_zext) + /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */ + insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code); + + /* Make sdiv/smod divide-by-minus-one 
exceptions impossible. */ + if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) || + insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) || + insn->code == (BPF_ALU | BPF_MOD | BPF_K) || + insn->code == (BPF_ALU | BPF_DIV | BPF_K)) && + insn->off == 1 && insn->imm == -1) { + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + bool isdiv = BPF_OP(insn->code) == BPF_DIV; + struct bpf_insn *patch = insn_buf; + + if (isdiv) + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_NEG | BPF_K, insn->dst_reg, + 0, 0, 0); + else + *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); + + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */ + if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + bool isdiv = BPF_OP(insn->code) == BPF_DIV; + bool is_sdiv = isdiv && insn->off == 1; + bool is_smod = !isdiv && insn->off == 1; + struct bpf_insn *patch = insn_buf; + + if (is_sdiv) { + /* [R,W]x sdiv 0 -> 0 + * LLONG_MIN sdiv -1 -> LLONG_MIN + * INT_MIN sdiv -1 -> INT_MIN + */ + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_ADD | BPF_K, BPF_REG_AX, + 0, 0, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JGT | BPF_K, BPF_REG_AX, + 0, 4, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, BPF_REG_AX, + 0, 1, 0); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_MOV | BPF_K, insn->dst_reg, + 0, 0, 0); + /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */ + *patch++ = BPF_RAW_INSN((is64 ? 
BPF_ALU64 : BPF_ALU) | + BPF_NEG | BPF_K, insn->dst_reg, + 0, 0, 0); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + cnt = patch - insn_buf; + } else if (is_smod) { + /* [R,W]x mod 0 -> [R,W]x */ + /* [R,W]x mod -1 -> 0 */ + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_ADD | BPF_K, BPF_REG_AX, + 0, 0, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JGT | BPF_K, BPF_REG_AX, + 0, 3, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, BPF_REG_AX, + 0, 3 + (is64 ? 0 : 1), 1); + *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + + if (!is64) { + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); + } + cnt = patch - insn_buf; + } else if (isdiv) { + /* [R,W]x div 0 -> 0 */ + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0); + *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + cnt = patch - insn_buf; + } else { + /* [R,W]x mod 0 -> [R,W]x */ + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1 + (is64 ? 
0 : 1), 0); + *patch++ = *insn; + + if (!is64) { + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); + } + cnt = patch - insn_buf; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Make it impossible to de-reference a userspace address */ + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) == BPF_PROBE_MEM || + BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { + struct bpf_insn *patch = insn_buf; + u64 uaddress_limit = bpf_arch_uaddress_limit(); + + if (!uaddress_limit) + goto next_insn; + + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + if (insn->off) + *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off); + *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32); + *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2); + *patch++ = *insn; + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0); + + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ + if (BPF_CLASS(insn->code) == BPF_LD && + (BPF_MODE(insn->code) == BPF_ABS || + BPF_MODE(insn->code) == BPF_IND)) { + cnt = env->ops->gen_ld_abs(insn, insn_buf); + if (cnt == 0 || cnt >= INSN_BUF_SIZE) { + verifier_bug(env, "%d insns generated for ld_abs", cnt); + return -EFAULT; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Rewrite pointer arithmetic to mitigate speculation attacks. 
*/ + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { + const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; + const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; + struct bpf_insn *patch = insn_buf; + bool issrc, isneg, isimm; + u32 off_reg; + + aux = &env->insn_aux_data[i + delta]; + if (!aux->alu_state || + aux->alu_state == BPF_ALU_NON_POINTER) + goto next_insn; + + isneg = aux->alu_state & BPF_ALU_NEG_VALUE; + issrc = (aux->alu_state & BPF_ALU_SANITIZE) == + BPF_ALU_SANITIZE_SRC; + isimm = aux->alu_state & BPF_ALU_IMMEDIATE; + + off_reg = issrc ? insn->src_reg : insn->dst_reg; + if (isimm) { + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); + } else { + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); + } + if (!issrc) + *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); + insn->src_reg = BPF_REG_AX; + if (isneg) + insn->code = insn->code == code_add ? + code_sub : code_add; + *patch++ = *insn; + if (issrc && isneg && !isimm) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + if (bpf_is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { + int stack_off_cnt = -stack_depth - 16; + + /* + * Two 8 byte slots, depth-16 stores the count, and + * depth-8 stores the start timestamp of the loop. + * + * The starting value of count is BPF_MAX_TIMED_LOOPS + * (0xffff). 
Every iteration loads it and subs it by 1, + * until the value becomes 0 in AX (thus, 1 in stack), + * after which we call arch_bpf_timed_may_goto, which + * either sets AX to 0xffff to keep looping, or to 0 + * upon timeout. AX is then stored into the stack. In + * the next iteration, we either see 0 and break out, or + * continue iterating until the next time value is 0 + * after subtraction, rinse and repeat. + */ + stack_depth_extra = 16; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); + /* + * AX is used as an argument to pass in stack_off_cnt + * (to add to r10/fp), and also as the return value of + * the call to arch_bpf_timed_may_goto. + */ + insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); + insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); + insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); + cnt = 7; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } else if (bpf_is_may_goto_insn(insn)) { + int stack_off = -stack_depth - 8; + + stack_depth_extra = 8; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off); + cnt = 4; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = 
new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + if (insn->code != (BPF_JMP | BPF_CALL)) + goto next_insn; + if (insn->src_reg == BPF_PSEUDO_CALL) + goto next_insn; + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + ret = bpf_fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); + if (ret) + return ret; + if (cnt == 0) + goto next_insn; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Skip inlining the helper call if the JIT does it. */ + if (bpf_jit_inlines_helper_call(insn->imm)) + goto next_insn; + + if (insn->imm == BPF_FUNC_get_route_realm) + prog->dst_needed = 1; + if (insn->imm == BPF_FUNC_get_prandom_u32) + bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; + if (insn->imm == BPF_FUNC_tail_call) { + /* If we tail call into other programs, we + * cannot make any assumptions since they can + * be replaced dynamically during runtime in + * the program array. 
+ */ + prog->cb_access = 1; + if (!bpf_allow_tail_call_in_subprogs(env)) + prog->aux->stack_depth = MAX_BPF_STACK; + prog->aux->max_pkt_offset = MAX_PACKET_OFF; + + /* mark bpf_tail_call as different opcode to avoid + * conditional branch in the interpreter for every normal + * call and to prevent accidental JITing by JIT compiler + * that doesn't support bpf_tail_call yet + */ + insn->imm = 0; + insn->code = BPF_JMP | BPF_TAIL_CALL; + + aux = &env->insn_aux_data[i + delta]; + if (env->bpf_capable && !prog->blinding_requested && + prog->jit_requested && + !bpf_map_key_poisoned(aux) && + !bpf_map_ptr_poisoned(aux) && + !bpf_map_ptr_unpriv(aux)) { + struct bpf_jit_poke_descriptor desc = { + .reason = BPF_POKE_REASON_TAIL_CALL, + .tail_call.map = aux->map_ptr_state.map_ptr, + .tail_call.key = bpf_map_key_immediate(aux), + .insn_idx = i + delta, + }; + + ret = bpf_jit_add_poke_descriptor(prog, &desc); + if (ret < 0) { + verbose(env, "adding tail call poke descriptor failed\n"); + return ret; + } + + insn->imm = ret + 1; + goto next_insn; + } + + if (!bpf_map_ptr_unpriv(aux)) + goto next_insn; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + if (bpf_map_ptr_poisoned(aux)) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + + map_ptr = aux->map_ptr_state.map_ptr; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + if (insn->imm == BPF_FUNC_timer_set_callback) { + /* The verifier will process callback_fn as many 
times as necessary + * with different maps and the register states prepared by + * set_timer_callback_state will be accurate. + * + * The following use case is valid: + * map1 is shared by prog1, prog2, prog3. + * prog1 calls bpf_timer_init for some map1 elements + * prog2 calls bpf_timer_set_callback for some map1 elements. + * Those that were not bpf_timer_init-ed will return -EINVAL. + * prog3 calls bpf_timer_start for some map1 elements. + * Those that were not both bpf_timer_init-ed and + * bpf_timer_set_callback-ed will return -EINVAL. + */ + struct bpf_insn ld_addrs[2] = { + BPF_LD_IMM64(BPF_REG_3, (long)prog->aux), + }; + + insn_buf[0] = ld_addrs[0]; + insn_buf[1] = ld_addrs[1]; + insn_buf[2] = *insn; + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + + /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */ + if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) { + /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data, + * bpf_mem_alloc() returns a ptr to the percpu data ptr. + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + insn_buf[1] = *insn; + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup + * and other inlining handlers are currently limited to 64 bit + * only. 
+ */ + if (prog->jit_requested && BITS_PER_LONG == 64 && + (insn->imm == BPF_FUNC_map_lookup_elem || + insn->imm == BPF_FUNC_map_update_elem || + insn->imm == BPF_FUNC_map_delete_elem || + insn->imm == BPF_FUNC_map_push_elem || + insn->imm == BPF_FUNC_map_pop_elem || + insn->imm == BPF_FUNC_map_peek_elem || + insn->imm == BPF_FUNC_redirect_map || + insn->imm == BPF_FUNC_for_each_map_elem || + insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { + aux = &env->insn_aux_data[i + delta]; + if (bpf_map_ptr_poisoned(aux)) + goto patch_call_imm; + + map_ptr = aux->map_ptr_state.map_ptr; + ops = map_ptr->ops; + if (insn->imm == BPF_FUNC_map_lookup_elem && + ops->map_gen_lookup) { + cnt = ops->map_gen_lookup(map_ptr, insn_buf); + if (cnt == -EOPNOTSUPP) + goto patch_map_ops_generic; + if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { + verifier_bug(env, "%d insns generated for map lookup", cnt); + return -EFAULT; + } + + new_prog = bpf_patch_insn_data(env, i + delta, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_delete_elem, + (long (*)(struct bpf_map *map, void *key))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_update_elem, + (long (*)(struct bpf_map *map, void *key, void *value, + u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_push_elem, + (long (*)(struct bpf_map *map, void *value, + u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_pop_elem, + (long (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_peek_elem, + (long (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_redirect, + (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, + (long (*)(struct bpf_map *map, + bpf_callback_t 
callback_fn, + void *callback_ctx, + u64 flags))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, + (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); + +patch_map_ops_generic: + switch (insn->imm) { + case BPF_FUNC_map_lookup_elem: + insn->imm = BPF_CALL_IMM(ops->map_lookup_elem); + goto next_insn; + case BPF_FUNC_map_update_elem: + insn->imm = BPF_CALL_IMM(ops->map_update_elem); + goto next_insn; + case BPF_FUNC_map_delete_elem: + insn->imm = BPF_CALL_IMM(ops->map_delete_elem); + goto next_insn; + case BPF_FUNC_map_push_elem: + insn->imm = BPF_CALL_IMM(ops->map_push_elem); + goto next_insn; + case BPF_FUNC_map_pop_elem: + insn->imm = BPF_CALL_IMM(ops->map_pop_elem); + goto next_insn; + case BPF_FUNC_map_peek_elem: + insn->imm = BPF_CALL_IMM(ops->map_peek_elem); + goto next_insn; + case BPF_FUNC_redirect_map: + insn->imm = BPF_CALL_IMM(ops->map_redirect); + goto next_insn; + case BPF_FUNC_for_each_map_elem: + insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); + goto next_insn; + case BPF_FUNC_map_lookup_percpu_elem: + insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); + goto next_insn; + } + + goto patch_call_imm; + } + + /* Implement bpf_jiffies64 inline. */ + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_jiffies64) { + struct bpf_insn ld_jiffies_addr[2] = { + BPF_LD_IMM64(BPF_REG_0, + (unsigned long)&jiffies), + }; + + insn_buf[0] = ld_jiffies_addr[0]; + insn_buf[1] = ld_jiffies_addr[1]; + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, + BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + /* Implement bpf_get_smp_processor_id() inline. 
*/ + if (insn->imm == BPF_FUNC_get_smp_processor_id && + bpf_verifier_inlines_helper_call(env, insn->imm)) { + /* BPF_FUNC_get_smp_processor_id inlining is an + * optimization, so if cpu_number is ever + * changed in some incompatible and hard to support + * way, it's fine to back out this inlining logic + */ +#ifdef CONFIG_SMP + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; +#else + insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + cnt = 1; +#endif + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. */ + if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) && + bpf_verifier_inlines_helper_call(env, insn->imm)) { + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)¤t_task); + insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + cnt = 3; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } +#endif + /* Implement bpf_get_func_arg inline. 
*/ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_arg) { + if (eatype == BPF_TRACE_RAW_TP) { + int nr_args = btf_type_vlen(prog->aux->attach_func_proto); + + /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + cnt = 1; + } else { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + cnt = 2; + } + insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); + insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0); + insn_buf[cnt++] = BPF_JMP_A(1); + insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Implement bpf_get_func_ret inline. 
*/ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_ret) { + if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_TRACE_FSESSION || + eatype == BPF_MODIFY_RETURN) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); + insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); + cnt = 7; + } else { + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); + cnt = 1; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Implement get_func_arg_cnt inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_arg_cnt) { + if (eatype == BPF_TRACE_RAW_TP) { + int nr_args = btf_type_vlen(prog->aux->attach_func_proto); + + /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); + cnt = 1; + } else { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + cnt = 2; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Implement bpf_get_func_ip inline. 
*/ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_ip) { + /* Load IP address from ctx - 16 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16); + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + if (!new_prog) + return -ENOMEM; + + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Implement bpf_get_branch_snapshot inline. */ + if (IS_ENABLED(CONFIG_PERF_EVENTS) && + prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_get_branch_snapshot) { + /* We are dealing with the following func protos: + * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); + * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt); + */ + const u32 br_entry_size = sizeof(struct perf_branch_entry); + + /* struct perf_branch_entry is part of UAPI and is + * used as an array element, so extremely unlikely to + * ever grow or shrink + */ + BUILD_BUG_ON(br_entry_size != 24); + + /* if (unlikely(flags)) return -EINVAL */ + insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7); + + /* Transform size (bytes) into number of entries (cnt = size / 24). + * But to avoid expensive division instruction, we implement + * divide-by-3 through multiplication, followed by further + * division by 8 through 3-bit right shift. + * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., + * p. 227, chapter "Unsigned Division by 3" for details and proofs. + * + * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. 
+ */ + insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab); + insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0); + insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36); + + /* call perf_snapshot_branch_stack implementation */ + insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack)); + /* if (entry_cnt == 0) return -ENOENT */ + insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4); + /* return entry_cnt * sizeof(struct perf_branch_entry) */ + insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size); + insn_buf[7] = BPF_JMP_A(3); + /* return -EINVAL; */ + insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + insn_buf[9] = BPF_JMP_A(1); + /* return -ENOENT; */ + insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT); + cnt = 11; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } + + /* Implement bpf_kptr_xchg inline */ + if (prog->jit_requested && BITS_PER_LONG == 64 && + insn->imm == BPF_FUNC_kptr_xchg && + bpf_jit_supports_ptr_xchg()) { + insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2); + insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0); + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } +patch_call_imm: + fn = env->ops->get_func_proto(insn->imm, env->prog); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + if (!fn->func) { + verifier_bug(env, + "not inlined functions %s#%d is missing func", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } + insn->imm = fn->func - __bpf_call_base; +next_insn: + if (subprogs[cur_subprog + 1].start == i + delta + 1) { + subprogs[cur_subprog].stack_depth += 
stack_depth_extra; + subprogs[cur_subprog].stack_extra = stack_depth_extra; + + stack_depth = subprogs[cur_subprog].stack_depth; + if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) { + verbose(env, "stack size %d(extra %d) is too large\n", + stack_depth, stack_depth_extra); + return -EINVAL; + } + cur_subprog++; + stack_depth = subprogs[cur_subprog].stack_depth; + stack_depth_extra = 0; + } + i++; + insn++; + } + + env->prog->aux->stack_depth = subprogs[0].stack_depth; + for (i = 0; i < env->subprog_cnt; i++) { + int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; + int subprog_start = subprogs[i].start; + int stack_slots = subprogs[i].stack_extra / 8; + int slots = delta, cnt = 0; + + if (!stack_slots) + continue; + /* We need two slots in case timed may_goto is supported. */ + if (stack_slots > slots) { + verifier_bug(env, "stack_slots supports may_goto only"); + return -EFAULT; + } + + stack_depth = subprogs[i].stack_depth; + if (bpf_jit_supports_timed_may_goto()) { + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_TIMED_LOOPS); + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); + } else { + /* Add ST insn to subprog prologue to init extra stack */ + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_LOOPS); + } + /* Copy first actual insn to preserve it */ + insn_buf[cnt++] = env->prog->insnsi[subprog_start]; + + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = prog = new_prog; + /* + * If may_goto is a first insn of a prog there could be a jmp + * insn that points to it, hence adjust all such jmps to point + * to insn after BPF_ST that inits may_goto count. + * Adjustment will succeed because bpf_patch_insn_data() didn't fail. + */ + WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); + } + + /* Since poke tab is now finalized, publish aux to tracker. 
*/ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + if (!map_ptr->ops->map_poke_track || + !map_ptr->ops->map_poke_untrack || + !map_ptr->ops->map_poke_run) { + verifier_bug(env, "poke tab is misconfigured"); + return -EFAULT; + } + + ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); + if (ret < 0) { + verbose(env, "tracking tail call prog failed\n"); + return ret; + } + } + + ret = sort_kfunc_descs_by_imm_off(env); + if (ret) + return ret; + + return 0; +} + +static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, + int position, + s32 stack_base, + u32 callback_subprogno, + u32 *total_cnt) +{ + s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; + s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; + s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; + int reg_loop_max = BPF_REG_6; + int reg_loop_cnt = BPF_REG_7; + int reg_loop_ctx = BPF_REG_8; + + struct bpf_insn *insn_buf = env->insn_buf; + struct bpf_prog *new_prog; + u32 callback_start; + u32 call_insn_offset; + s32 callback_offset; + u32 cnt = 0; + + /* This represents an inlined version of bpf_iter.c:bpf_loop, + * be careful to modify this code in sync. + */ + + /* Return error and jump to the end of the patch if + * expected number of iterations is too big. 
+ */ + insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2); + insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG); + insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16); + /* spill R6, R7, R8 to use these as loop vars */ + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset); + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset); + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset); + /* initialize loop vars */ + insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1); + insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0); + insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3); + /* loop header, + * if reg_loop_cnt >= reg_loop_max skip the loop body + */ + insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5); + /* callback call, + * correct callback offset would be set after patching + */ + insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt); + insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx); + insn_buf[cnt++] = BPF_CALL_REL(0); + /* increment loop counter */ + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1); + /* jump to loop header if callback returned 0 */ + insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6); + /* return value of bpf_loop, + * set R0 to the number of iterations + */ + insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt); + /* restore original values of R6, R7, R8 */ + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset); + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset); + insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset); + + *total_cnt = cnt; + new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt); + if (!new_prog) + return new_prog; + + /* callback start is known only after patching */ + callback_start = env->subprog_info[callback_subprogno].start; + /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ + call_insn_offset = position + 12; 
+ callback_offset = callback_start - call_insn_offset - 1; + new_prog->insnsi[call_insn_offset].imm = callback_offset; + + return new_prog; +} + +static bool is_bpf_loop_call(struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && + insn->imm == BPF_FUNC_loop; +} + +/* For all sub-programs in the program (including main) check + * insn_aux_data to see if there are bpf_loop calls that require + * inlining. If such calls are found the calls are replaced with a + * sequence of instructions produced by `inline_bpf_loop` function and + * subprog stack_depth is increased by the size of 3 registers. + * This stack space is used to spill values of the R6, R7, R8. These + * registers are used to store the loop bound, counter and context + * variables. + */ +int bpf_optimize_bpf_loop(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprogs = env->subprog_info; + int i, cur_subprog = 0, cnt, delta = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u16 stack_depth = subprogs[cur_subprog].stack_depth; + u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + u16 stack_depth_extra = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + struct bpf_loop_inline_state *inline_state = + &env->insn_aux_data[i + delta].loop_inline_state; + + if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { + struct bpf_prog *new_prog; + + stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; + new_prog = inline_bpf_loop(env, + i + delta, + -(stack_depth + stack_depth_extra), + inline_state->callback_subprogno, + &cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + } + + if (subprogs[cur_subprog + 1].start == i + delta + 1) { + subprogs[cur_subprog].stack_depth += stack_depth_extra; + cur_subprog++; + stack_depth = subprogs[cur_subprog].stack_depth; + stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; + 
stack_depth_extra = 0; + } + } + + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; + + return 0; +} + +/* Remove unnecessary spill/fill pairs, members of fastcall pattern, + * adjust subprograms stack depth when possible. + */ +int bpf_remove_fastcall_spills_fills(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + u32 spills_num; + bool modified = false; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (aux[i].fastcall_spills_num > 0) { + spills_num = aux[i].fastcall_spills_num; + /* NOPs would be removed by opt_remove_nops() */ + for (j = 1; j <= spills_num; ++j) { + *(insn - j) = NOP; + *(insn + j) = NOP; + } + modified = true; + } + if ((subprog + 1)->start == i + 1) { + if (modified && !subprog->keep_fastcall_stack) + subprog->stack_depth = -subprog->fastcall_stack_off; + subprog++; + modified = false; + } + } + + return 0; +} + diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3b9d297a53be..3dd9b4924ae4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -125,6 +125,11 @@ struct htab_elem { char key[] __aligned(8); }; +struct htab_btf_record { + struct btf_record *record; + u32 key_size; +}; + static inline bool htab_is_prealloc(const struct bpf_htab *htab) { return !(htab->map.map_flags & BPF_F_NO_PREALLOC); @@ -457,6 +462,83 @@ static int htab_map_alloc_check(union bpf_attr *attr) return 0; } +static void htab_mem_dtor(void *obj, void *ctx) +{ + struct htab_btf_record *hrec = ctx; + struct htab_elem *elem = obj; + void *map_value; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + map_value = htab_elem_value(elem, hrec->key_size); + bpf_obj_free_fields(hrec->record, map_value); +} + +static void htab_pcpu_mem_dtor(void *obj, void *ctx) +{ + void __percpu *pptr = *(void __percpu **)obj; + struct htab_btf_record *hrec = ctx; + int cpu; + 
+ if (IS_ERR_OR_NULL(hrec->record)) + return; + + for_each_possible_cpu(cpu) + bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu)); +} + +static void htab_dtor_ctx_free(void *ctx) +{ + struct htab_btf_record *hrec = ctx; + + btf_record_free(hrec->record); + kfree(ctx); +} + +static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *)) +{ + u32 key_size = htab->map.key_size; + struct bpf_mem_alloc *ma; + struct htab_btf_record *hrec; + int err; + + /* No need for dtors. */ + if (IS_ERR_OR_NULL(htab->map.record)) + return 0; + + hrec = kzalloc(sizeof(*hrec), GFP_KERNEL); + if (!hrec) + return -ENOMEM; + hrec->key_size = key_size; + hrec->record = btf_record_dup(htab->map.record); + if (IS_ERR(hrec->record)) { + err = PTR_ERR(hrec->record); + kfree(hrec); + return err; + } + ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; + bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec); + return 0; +} + +static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, const struct btf_type *value_type) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + if (htab_is_prealloc(htab)) + return 0; + /* + * We must set the dtor using this callback, as map's BTF record is not + * populated in htab_map_alloc(), so it will always appear as NULL. 
+ */ + if (htab_is_percpu(htab)) + return htab_set_dtor(htab, htab_pcpu_mem_dtor); + else + return htab_set_dtor(htab, htab_mem_dtor); +} + static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -974,7 +1056,7 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, for_each_possible_cpu(cpu) { if (cpu == current_cpu) - copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value); + copy_map_value(&htab->map, per_cpu_ptr(pptr, cpu), value); else /* Since elem is preallocated, we cannot touch special fields */ zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); } @@ -1056,6 +1138,10 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } else if (fd_htab_map_needs_adjust(htab)) { size = round_up(size, 8); memcpy(htab_elem_value(l_new, key_size), value, size); + } else if (map_flags & BPF_F_LOCK) { + copy_map_value_locked(&htab->map, + htab_elem_value(l_new, key_size), + value, false); } else { copy_map_value(&htab->map, htab_elem_value(l_new, key_size), value); } @@ -2281,6 +2367,7 @@ const struct bpf_map_ops htab_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], @@ -2303,6 +2390,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru), .map_btf_id = &htab_map_btf_ids[0], @@ -2482,6 +2570,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, 
.map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_percpu), .map_btf_id = &htab_map_btf_ids[0], @@ -2502,6 +2591,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru_percpu), .map_btf_id = &htab_map_btf_ids[0], diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6eb6c82ed2ee..2bb60200c266 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -845,7 +845,13 @@ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, data->buf = buffers->buf; for (i = 0; i < fmt_size; i++) { - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + unsigned char c = fmt[i]; + + /* + * Permit bytes >= 0x80 in plain text so UTF-8 literals can pass + * through unchanged, while still rejecting ASCII control bytes. + */ + if (isascii(c) && !isprint(c) && !isspace(c)) { err = -EINVAL; goto out; } @@ -867,6 +873,15 @@ int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, * always access fmt[i + 1], in the worst case it will be a 0 */ i++; + c = fmt[i]; + /* + * The format parser below only understands ASCII conversion + * specifiers and modifiers, so reject non-ASCII after '%'. + */ + if (!isascii(c)) { + err = -EINVAL; + goto out; + } /* skip optional "[0 +-][num]" width formatting field */ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || @@ -1272,7 +1287,7 @@ static void bpf_async_cb_rcu_tasks_trace_free(struct rcu_head *rcu) return; } - /* rcu_trace_implies_rcu_gp() is true and will remain so */ + /* RCU Tasks Trace grace period implies RCU grace period. 
*/ bpf_async_cb_rcu_free(rcu); } @@ -2302,9 +2317,20 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, __bpf_kfunc_start_defs(); -__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) +/** + * bpf_obj_new() - allocate an object described by program BTF + * @local_type_id__k: type ID in program BTF + * @meta: verifier-supplied struct metadata + * + * Allocate an object of the type identified by @local_type_id__k and + * initialize its special fields. BPF programs can use + * bpf_core_type_id_local() to provide @local_type_id__k. The verifier + * rewrites @meta; BPF programs do not set it. + * + * Return: Pointer to the allocated object, or %NULL on failure. + */ +__bpf_kfunc void *bpf_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta) { - struct btf_struct_meta *meta = meta__ign; u64 size = local_type_id__k; void *p; @@ -2313,17 +2339,39 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) return NULL; if (meta) bpf_obj_init(meta->record, p); + return p; } -__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) +__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + return bpf_obj_new(local_type_id__k, meta__ign); +} + +/** + * bpf_percpu_obj_new() - allocate a percpu object described by program BTF + * @local_type_id__k: type ID in program BTF + * @meta: verifier-supplied struct metadata + * + * Allocate a percpu object of the type identified by @local_type_id__k. BPF + * programs can use bpf_core_type_id_local() to provide @local_type_id__k. + * The verifier rewrites @meta; BPF programs do not set it. + * + * Return: Pointer to the allocated percpu object, or %NULL on failure. 
+ */ +__bpf_kfunc void *bpf_percpu_obj_new(u64 local_type_id__k, struct btf_struct_meta *meta) { u64 size = local_type_id__k; - /* The verifier has ensured that meta__ign must be NULL */ + /* The verifier has ensured that meta must be NULL */ return bpf_mem_alloc(&bpf_global_percpu_ma, size); } +__bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + return bpf_percpu_obj_new(local_type_id__k, meta__ign); +} + /* Must be called under migrate_disable(), as required by bpf_mem_free */ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) { @@ -2347,23 +2395,56 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu) bpf_mem_free_rcu(ma, p); } -__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) +/** + * bpf_obj_drop() - drop a previously allocated object + * @p__alloc: object to free + * @meta: verifier-supplied struct metadata + * + * Destroy special fields in @p__alloc as needed and free the object. The + * verifier rewrites @meta; BPF programs do not set it. + */ +__bpf_kfunc void bpf_obj_drop(void *p__alloc, struct btf_struct_meta *meta) { - struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; __bpf_obj_drop_impl(p, meta ? meta->record : NULL, false); } -__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) +__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) +{ + return bpf_obj_drop(p__alloc, meta__ign); +} + +/** + * bpf_percpu_obj_drop() - drop a previously allocated percpu object + * @p__alloc: percpu object to free + * @meta: verifier-supplied struct metadata + * + * Free @p__alloc. The verifier rewrites @meta; BPF programs do not set it. 
+ */ +__bpf_kfunc void bpf_percpu_obj_drop(void *p__alloc, struct btf_struct_meta *meta) { - /* The verifier has ensured that meta__ign must be NULL */ + /* The verifier has ensured that meta must be NULL */ bpf_mem_free_rcu(&bpf_global_percpu_ma, p__alloc); } -__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) +__bpf_kfunc void bpf_percpu_obj_drop_impl(void *p__alloc, void *meta__ign) +{ + bpf_percpu_obj_drop(p__alloc, meta__ign); +} + +/** + * bpf_refcount_acquire() - turn a local kptr into an owning reference + * @p__refcounted_kptr: non-owning local kptr + * @meta: verifier-supplied struct metadata + * + * Increment the refcount for @p__refcounted_kptr. The verifier rewrites + * @meta; BPF programs do not set it. + * + * Return: Owning reference to @p__refcounted_kptr, or %NULL on failure. + */ +__bpf_kfunc void *bpf_refcount_acquire(void *p__refcounted_kptr, struct btf_struct_meta *meta) { - struct btf_struct_meta *meta = meta__ign; struct bpf_refcount *ref; /* Could just cast directly to refcount_t *, but need some code using @@ -2379,6 +2460,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta return (void *)p__refcounted_kptr; } +__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign) +{ + return bpf_refcount_acquire(p__refcounted_kptr, meta__ign); +} + static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, bool tail, struct btf_record *rec, u64 off) @@ -2406,24 +2492,62 @@ static int __bpf_list_add(struct bpf_list_node_kern *node, return 0; } +/** + * bpf_list_push_front() - add a node to the front of a BPF linked list + * @head: list head + * @node: node to insert + * @meta: verifier-supplied struct metadata + * @off: verifier-supplied offset of @node within the containing object + * + * Insert @node at the front of @head. The verifier rewrites @meta and @off; + * BPF programs do not set them. 
+ * + * Return: 0 on success, or %-EINVAL if @node is already linked. + */ +__bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head, + struct bpf_list_node *node, + struct btf_struct_meta *meta, + u64 off) +{ + struct bpf_list_node_kern *n = (void *)node; + + return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); +} + __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { + return bpf_list_push_front(head, node, meta__ign, off); +} + +/** + * bpf_list_push_back() - add a node to the back of a BPF linked list + * @head: list head + * @node: node to insert + * @meta: verifier-supplied struct metadata + * @off: verifier-supplied offset of @node within the containing object + * + * Insert @node at the back of @head. The verifier rewrites @meta and @off; + * BPF programs do not set them. + * + * Return: 0 on success, or %-EINVAL if @node is already linked. + */ +__bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head, + struct bpf_list_node *node, + struct btf_struct_meta *meta, + u64 off) +{ struct bpf_list_node_kern *n = (void *)node; - struct btf_struct_meta *meta = meta__ign; - return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, struct bpf_list_node *node, void *meta__ign, u64 off) { - struct bpf_list_node_kern *n = (void *)node; - struct btf_struct_meta *meta = meta__ign; - - return __bpf_list_add(n, head, true, meta ? 
meta->record : NULL, off); + return bpf_list_push_back(head, node, meta__ign, off); } static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) @@ -2535,16 +2659,37 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, return 0; } -__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, - bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), - void *meta__ign, u64 off) +/** + * bpf_rbtree_add() - add a node to a BPF rbtree + * @root: tree root + * @node: node to insert + * @less: comparator used to order nodes + * @meta: verifier-supplied struct metadata + * @off: verifier-supplied offset of @node within the containing object + * + * Insert @node into @root using @less. The verifier rewrites @meta and @off; + * BPF programs do not set them. + * + * Return: 0 on success, or %-EINVAL if @node is already linked in a tree. + */ +__bpf_kfunc int bpf_rbtree_add(struct bpf_rb_root *root, + struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + struct btf_struct_meta *meta, + u64 off) { - struct btf_struct_meta *meta = meta__ign; struct bpf_rb_node_kern *n = (void *)node; return __bpf_rbtree_add(root, n, (void *)less, meta ? 
meta->record : NULL, off); } +__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), + void *meta__ign, u64 off) +{ + return bpf_rbtree_add(root, node, less, meta__ign, off); +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) { struct rb_root_cached *r = (struct rb_root_cached *)root; @@ -4165,17 +4310,25 @@ static bool bpf_task_work_ctx_tryget(struct bpf_task_work_ctx *ctx) return refcount_inc_not_zero(&ctx->refcnt); } +static void bpf_task_work_destroy(struct irq_work *irq_work) +{ + struct bpf_task_work_ctx *ctx = container_of(irq_work, struct bpf_task_work_ctx, irq_work); + + bpf_task_work_ctx_reset(ctx); + kfree_rcu(ctx, rcu); +} + static void bpf_task_work_ctx_put(struct bpf_task_work_ctx *ctx) { if (!refcount_dec_and_test(&ctx->refcnt)) return; - bpf_task_work_ctx_reset(ctx); - - /* bpf_mem_free expects migration to be disabled */ - migrate_disable(); - bpf_mem_free(&bpf_global_ma, ctx); - migrate_enable(); + if (irqs_disabled()) { + ctx->irq_work = IRQ_WORK_INIT(bpf_task_work_destroy); + irq_work_queue(&ctx->irq_work); + } else { + bpf_task_work_destroy(&ctx->irq_work); + } } static void bpf_task_work_cancel(struct bpf_task_work_ctx *ctx) @@ -4229,7 +4382,7 @@ static void bpf_task_work_irq(struct irq_work *irq_work) enum bpf_task_work_state state; int err; - guard(rcu_tasks_trace)(); + guard(rcu)(); if (cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING) != BPF_TW_PENDING) { bpf_task_work_ctx_put(ctx); @@ -4251,9 +4404,9 @@ static void bpf_task_work_irq(struct irq_work *irq_work) /* * It's technically possible for just scheduled task_work callback to * complete running by now, going SCHEDULING -> RUNNING and then - * dropping its ctx refcount. Instead of capturing extra ref just to - * protected below ctx->state access, we rely on RCU protection to - * perform below SCHEDULING -> SCHEDULED attempt. 
+ * dropping its ctx refcount. Instead of capturing an extra ref just + * to protect below ctx->state access, we rely on rcu_read_lock + * above to prevent kfree_rcu from freeing ctx before we return. */ state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED); if (state == BPF_TW_FREED) @@ -4270,7 +4423,7 @@ static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *t if (ctx) return ctx; - ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_ctx)); + ctx = bpf_map_kmalloc_nolock(map, sizeof(*ctx), 0, NUMA_NO_NODE); if (!ctx) return ERR_PTR(-ENOMEM); @@ -4284,7 +4437,7 @@ static struct bpf_task_work_ctx *bpf_task_work_fetch_ctx(struct bpf_task_work *t * tw->ctx is set by concurrent BPF program, release allocated * memory and try to reuse already set context. */ - bpf_mem_free(&bpf_global_ma, ctx); + kfree_nolock(ctx); return old_ctx; } @@ -4296,13 +4449,23 @@ static struct bpf_task_work_ctx *bpf_task_work_acquire_ctx(struct bpf_task_work { struct bpf_task_work_ctx *ctx; - ctx = bpf_task_work_fetch_ctx(tw, map); - if (IS_ERR(ctx)) - return ctx; - - /* try to get ref for task_work callback to hold */ - if (!bpf_task_work_ctx_tryget(ctx)) - return ERR_PTR(-EBUSY); + /* + * Sleepable BPF programs hold rcu_read_lock_trace but not + * regular rcu_read_lock. Since kfree_rcu waits for regular + * RCU GP, the ctx can be freed while we're between reading + * the pointer and incrementing the refcount. Take regular + * rcu_read_lock to prevent kfree_rcu from freeing the ctx + * before we can tryget it. 
+ */ + scoped_guard(rcu) { + ctx = bpf_task_work_fetch_ctx(tw, map); + if (IS_ERR(ctx)) + return ctx; + + /* try to get ref for task_work callback to hold */ + if (!bpf_task_work_ctx_tryget(ctx)) + return ERR_PTR(-EBUSY); + } if (cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) { /* lost acquiring race or map_release_uref() stole it from us, put ref and bail */ @@ -4417,7 +4580,7 @@ static int make_file_dynptr(struct file *file, u32 flags, bool may_sleep, return -EINVAL; } - state = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_dynptr_file_impl)); + state = kmalloc_nolock(sizeof(*state), 0, NUMA_NO_NODE); if (!state) { bpf_dynptr_set_null(ptr); return -ENOMEM; @@ -4449,7 +4612,7 @@ __bpf_kfunc int bpf_dynptr_file_discard(struct bpf_dynptr *dynptr) return 0; freader_cleanup(&df->freader); - bpf_mem_free(&bpf_global_ma, df); + kfree_nolock(df); bpf_dynptr_set_null(ptr); return 0; } @@ -4536,12 +4699,19 @@ BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif +BTF_ID_FLAGS(func, bpf_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_percpu_obj_new, KF_ACQUIRE | KF_RET_NULL | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_percpu_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_percpu_obj_drop, KF_RELEASE | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_refcount_acquire, KF_ACQUIRE | KF_RET_NULL | KF_RCU | KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU) +BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_front_impl) +BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_back_impl) BTF_ID_FLAGS(func, 
bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) @@ -4550,6 +4720,7 @@ BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_rbtree_add, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_rbtree_add_impl) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL) @@ -4578,6 +4749,9 @@ BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) #endif #endif +#ifdef CONFIG_S390 +BTF_ID_FLAGS(func, bpf_get_lowcore) +#endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 998986853c61..332e6e003f27 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -2,217 +2,119 @@ /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ #include <linux/bpf_verifier.h> +#include <linux/btf.h> #include <linux/hashtable.h> #include <linux/jhash.h> #include <linux/slab.h> +#include <linux/sort.h> -/* - * This file implements live stack slots analysis. After accumulating - * stack usage data, the analysis answers queries about whether a - * particular stack slot may be read by an instruction or any of it's - * successors. This data is consumed by the verifier states caching - * mechanism to decide which stack slots are important when looking for a - * visited state corresponding to the current state. - * - * The analysis is call chain sensitive, meaning that data is collected - * and queried for tuples (call chain, subprogram instruction index). - * Such sensitivity allows identifying if some subprogram call always - * leads to writes in the caller's stack. 
- * - * The basic idea is as follows: - * - As the verifier accumulates a set of visited states, the analysis instance - * accumulates a conservative estimate of stack slots that can be read - * or must be written for each visited tuple (call chain, instruction index). - * - If several states happen to visit the same instruction with the same - * call chain, stack usage information for the corresponding tuple is joined: - * - "may_read" set represents a union of all possibly read slots - * (any slot in "may_read" set might be read at or after the instruction); - * - "must_write" set represents an intersection of all possibly written slots - * (any slot in "must_write" set is guaranteed to be written by the instruction). - * - The analysis is split into two phases: - * - read and write marks accumulation; - * - read and write marks propagation. - * - The propagation phase is a textbook live variable data flow analysis: - * - * state[cc, i].live_after = U [state[cc, s].live_before for s in bpf_insn_successors(i)] - * state[cc, i].live_before = - * (state[cc, i].live_after / state[cc, i].must_write) U state[i].may_read - * - * Where: - * - `U` stands for set union - * - `/` stands for set difference; - * - `cc` stands for a call chain; - * - `i` and `s` are instruction indexes; - * - * The above equations are computed for each call chain and instruction - * index until state stops changing. - * - Additionally, in order to transfer "must_write" information from a - * subprogram to call instructions invoking this subprogram, - * the "must_write_acc" set is tracked for each (cc, i) tuple. - * A set of stack slots that are guaranteed to be written by this - * instruction or any of its successors (within the subprogram). 
- * The equation for "must_write_acc" propagation looks as follows: - * - * state[cc, i].must_write_acc = - * ∩ [state[cc, s].must_write_acc for s in bpf_insn_successors(i)] - * U state[cc, i].must_write - * - * (An intersection of all "must_write_acc" for instruction successors - * plus all "must_write" slots for the instruction itself). - * - After the propagation phase completes for a subprogram, information from - * (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain: - * - "must_write_acc" set is intersected with the call site's "must_write" set; - * - "may_read" set is added to the call site's "may_read" set. - * - Any live stack queries must be taken after the propagation phase. - * - Accumulation and propagation phases can be entered multiple times, - * at any point in time: - * - "may_read" set only grows; - * - "must_write" set only shrinks; - * - for each visited verifier state with zero branches, all relevant - * read and write marks are already recorded by the analysis instance. - * - * Technically, the analysis is facilitated by the following data structures: - * - Call chain: for given verifier state, the call chain is a tuple of call - * instruction indexes leading to the current subprogram plus the subprogram - * entry point index. - * - Function instance: for a given call chain, for each instruction in - * the current subprogram, a mapping between instruction index and a - * set of "may_read", "must_write" and other marks accumulated for this - * instruction. - * - A hash table mapping call chains to function instances. - */ - -struct callchain { - u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */ - /* cached subprog_info[*].start for functions owning the frames: - * - sp_starts[curframe] used to get insn relative index within current function; - * - sp_starts[0..current-1] used for fast callchain_frame_up(). 
- */ - u32 sp_starts[MAX_CALL_FRAMES]; - u32 curframe; /* depth of callsites and sp_starts arrays */ -}; +#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) struct per_frame_masks { - u64 may_read; /* stack slots that may be read by this instruction */ - u64 must_write; /* stack slots written by this instruction */ - u64 must_write_acc; /* stack slots written by this instruction and its successors */ - u64 live_before; /* stack slots that may be read by this insn and its successors */ + spis_t may_read; /* stack slots that may be read by this instruction */ + spis_t must_write; /* stack slots written by this instruction */ + spis_t live_before; /* stack slots that may be read by this insn and its successors */ }; /* - * A function instance created for a specific callchain. + * A function instance keyed by (callsite, depth). * Encapsulates read and write marks for each instruction in the function. - * Marks are tracked for each frame in the callchain. + * Marks are tracked for each frame up to @depth. */ struct func_instance { struct hlist_node hl_node; - struct callchain callchain; + u32 callsite; /* call insn that invoked this subprog (subprog_start for depth 0) */ + u32 depth; /* call depth (0 = entry subprog) */ + u32 subprog; /* subprog index */ + u32 subprog_start; /* cached env->subprog_info[subprog].start */ u32 insn_cnt; /* cached number of insns in the function */ - bool updated; - bool must_write_dropped; /* Per frame, per instruction masks, frames allocated lazily. */ struct per_frame_masks *frames[MAX_CALL_FRAMES]; - /* For each instruction a flag telling if "must_write" had been initialized for it. 
*/ - bool *must_write_set; + bool must_write_initialized; }; struct live_stack_query { struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */ + u32 callsites[MAX_CALL_FRAMES]; /* callsite[i] = insn calling frame i+1 */ u32 curframe; u32 insn_idx; }; struct bpf_liveness { - DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */ + DECLARE_HASHTABLE(func_instances, 8); /* maps (depth, callsite) to func_instance */ struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */ - /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */ - struct func_instance *cur_instance; - /* - * Below fields are used to accumulate stack write marks for instruction at - * @write_insn_idx before submitting the marks to @cur_instance. - */ - u64 write_masks_acc[MAX_CALL_FRAMES]; - u32 write_insn_idx; + u32 subprog_calls; /* analyze_subprog() invocations */ }; -/* Compute callchain corresponding to state @st at depth @frameno */ -static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st, - struct callchain *callchain, u32 frameno) +/* + * Hash/compare key for func_instance: (depth, callsite). + * For depth == 0 (entry subprog), @callsite is the subprog start insn. + * For depth > 0, @callsite is the call instruction index that invoked the subprog. 
+ */ +static u32 instance_hash(u32 callsite, u32 depth) { - struct bpf_subprog_info *subprog_info = env->subprog_info; - u32 i; + u32 key[2] = { depth, callsite }; - memset(callchain, 0, sizeof(*callchain)); - for (i = 0; i <= frameno; i++) { - callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start; - if (i < st->curframe) - callchain->callsites[i] = st->frame[i + 1]->callsite; - } - callchain->curframe = frameno; - callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe]; + return jhash2(key, 2, 0); } -static u32 hash_callchain(struct callchain *callchain) +static struct func_instance *find_instance(struct bpf_verifier_env *env, + u32 callsite, u32 depth) { - return jhash2(callchain->callsites, callchain->curframe, 0); -} - -static bool same_callsites(struct callchain *a, struct callchain *b) -{ - int i; + struct bpf_liveness *liveness = env->liveness; + struct func_instance *f; + u32 key = instance_hash(callsite, depth); - if (a->curframe != b->curframe) - return false; - for (i = a->curframe; i >= 0; i--) - if (a->callsites[i] != b->callsites[i]) - return false; - return true; + hash_for_each_possible(liveness->func_instances, f, hl_node, key) + if (f->depth == depth && f->callsite == callsite) + return f; + return NULL; } -/* - * Find existing or allocate new function instance corresponding to @callchain. - * Instances are accumulated in env->liveness->func_instances and persist - * until the end of the verification process. 
- */ -static struct func_instance *__lookup_instance(struct bpf_verifier_env *env, - struct callchain *callchain) +static struct func_instance *call_instance(struct bpf_verifier_env *env, + struct func_instance *caller, + u32 callsite, int subprog) { - struct bpf_liveness *liveness = env->liveness; - struct bpf_subprog_info *subprog; - struct func_instance *result; - u32 subprog_sz, size, key; - - key = hash_callchain(callchain); - hash_for_each_possible(liveness->func_instances, result, hl_node, key) - if (same_callsites(&result->callchain, callchain)) - return result; - - subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]); - subprog_sz = (subprog + 1)->start - subprog->start; - size = sizeof(struct func_instance); - result = kvzalloc(size, GFP_KERNEL_ACCOUNT); - if (!result) - return ERR_PTR(-ENOMEM); - result->must_write_set = kvzalloc_objs(*result->must_write_set, - subprog_sz, GFP_KERNEL_ACCOUNT); - if (!result->must_write_set) { - kvfree(result); + u32 depth = caller ? caller->depth + 1 : 0; + u32 subprog_start = env->subprog_info[subprog].start; + u32 lookup_key = depth > 0 ? 
callsite : subprog_start; + struct func_instance *f; + u32 hash; + + f = find_instance(env, lookup_key, depth); + if (f) + return f; + + f = kvzalloc(sizeof(*f), GFP_KERNEL_ACCOUNT); + if (!f) return ERR_PTR(-ENOMEM); - } - memcpy(&result->callchain, callchain, sizeof(*callchain)); - result->insn_cnt = subprog_sz; - hash_add(liveness->func_instances, &result->hl_node, key); - return result; + f->callsite = lookup_key; + f->depth = depth; + f->subprog = subprog; + f->subprog_start = subprog_start; + f->insn_cnt = (env->subprog_info + subprog + 1)->start - subprog_start; + hash = instance_hash(lookup_key, depth); + hash_add(env->liveness->func_instances, &f->hl_node, hash); + return f; } static struct func_instance *lookup_instance(struct bpf_verifier_env *env, struct bpf_verifier_state *st, u32 frameno) { - struct callchain callchain; - - compute_callchain(env, st, &callchain, frameno); - return __lookup_instance(env, &callchain); + u32 callsite, subprog_start; + struct func_instance *f; + u32 key, depth; + + subprog_start = env->subprog_info[st->frame[frameno]->subprogno].start; + callsite = frameno > 0 ? st->frame[frameno]->callsite : subprog_start; + + for (depth = frameno; ; depth--) { + key = depth > 0 ? 
callsite : subprog_start; + f = find_instance(env, key, depth); + if (f || depth == 0) + return f; + } } int bpf_stack_liveness_init(struct bpf_verifier_env *env) @@ -233,9 +135,8 @@ void bpf_stack_liveness_free(struct bpf_verifier_env *env) if (!env->liveness) return; hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) { - for (i = 0; i <= instance->callchain.curframe; i++) + for (i = 0; i <= instance->depth; i++) kvfree(instance->frames[i]); - kvfree(instance->must_write_set); kvfree(instance); } kvfree(env->liveness); @@ -247,7 +148,7 @@ void bpf_stack_liveness_free(struct bpf_verifier_env *env) */ static int relative_idx(struct func_instance *instance, u32 insn_idx) { - return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe]; + return insn_idx - instance->subprog_start; } static struct per_frame_masks *get_frame_masks(struct func_instance *instance, @@ -259,8 +160,7 @@ static struct per_frame_masks *get_frame_masks(struct func_instance *instance, return &instance->frames[frame][relative_idx(instance, insn_idx)]; } -static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env, - struct func_instance *instance, +static struct per_frame_masks *alloc_frame_masks(struct func_instance *instance, u32 frame, u32 insn_idx) { struct per_frame_masks *arr; @@ -275,167 +175,29 @@ static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env, return get_frame_masks(instance, frame, insn_idx); } -void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env) -{ - env->liveness->cur_instance = NULL; -} - -/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. 
*/ -static int ensure_cur_instance(struct bpf_verifier_env *env) -{ - struct bpf_liveness *liveness = env->liveness; - struct func_instance *instance; - - if (liveness->cur_instance) - return 0; - - instance = lookup_instance(env, env->cur_state, env->cur_state->curframe); - if (IS_ERR(instance)) - return PTR_ERR(instance); - - liveness->cur_instance = instance; - return 0; -} - /* Accumulate may_read masks for @frame at @insn_idx */ -static int mark_stack_read(struct bpf_verifier_env *env, - struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask) +static int mark_stack_read(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask) { struct per_frame_masks *masks; - u64 new_may_read; - masks = alloc_frame_masks(env, instance, frame, insn_idx); + masks = alloc_frame_masks(instance, frame, insn_idx); if (IS_ERR(masks)) return PTR_ERR(masks); - new_may_read = masks->may_read | mask; - if (new_may_read != masks->may_read && - ((new_may_read | masks->live_before) != masks->live_before)) - instance->updated = true; - masks->may_read |= mask; - return 0; -} - -int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask) -{ - int err; - - err = ensure_cur_instance(env); - err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask); - return err; -} - -static void reset_stack_write_marks(struct bpf_verifier_env *env, - struct func_instance *instance, u32 insn_idx) -{ - struct bpf_liveness *liveness = env->liveness; - int i; - - liveness->write_insn_idx = insn_idx; - for (i = 0; i <= instance->callchain.curframe; i++) - liveness->write_masks_acc[i] = 0; -} - -int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx) -{ - struct bpf_liveness *liveness = env->liveness; - int err; - - err = ensure_cur_instance(env); - if (err) - return err; - - reset_stack_write_marks(env, liveness->cur_instance, insn_idx); + masks->may_read = spis_or(masks->may_read, mask); return 0; } -void 
bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask) -{ - env->liveness->write_masks_acc[frame] |= mask; -} - -static int commit_stack_write_marks(struct bpf_verifier_env *env, - struct func_instance *instance) +static int mark_stack_write(struct func_instance *instance, u32 frame, u32 insn_idx, spis_t mask) { - struct bpf_liveness *liveness = env->liveness; - u32 idx, frame, curframe, old_must_write; struct per_frame_masks *masks; - u64 mask; - - if (!instance) - return 0; - curframe = instance->callchain.curframe; - idx = relative_idx(instance, liveness->write_insn_idx); - for (frame = 0; frame <= curframe; frame++) { - mask = liveness->write_masks_acc[frame]; - /* avoid allocating frames for zero masks */ - if (mask == 0 && !instance->must_write_set[idx]) - continue; - masks = alloc_frame_masks(env, instance, frame, liveness->write_insn_idx); - if (IS_ERR(masks)) - return PTR_ERR(masks); - old_must_write = masks->must_write; - /* - * If instruction at this callchain is seen for a first time, set must_write equal - * to @mask. Otherwise take intersection with the previous value. - */ - if (instance->must_write_set[idx]) - mask &= old_must_write; - if (old_must_write != mask) { - masks->must_write = mask; - instance->updated = true; - } - if (old_must_write & ~mask) - instance->must_write_dropped = true; - } - instance->must_write_set[idx] = true; - liveness->write_insn_idx = 0; + masks = alloc_frame_masks(instance, frame, insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + masks->must_write = spis_or(masks->must_write, mask); return 0; } -/* - * Merge stack writes marks in @env->liveness->write_masks_acc - * with information already in @env->liveness->cur_instance. 
- */ -int bpf_commit_stack_write_marks(struct bpf_verifier_env *env) -{ - return commit_stack_write_marks(env, env->liveness->cur_instance); -} - -static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain) -{ - char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf); - char *buf = env->tmp_str_buf; - int i; - - buf += snprintf(buf, buf_end - buf, "("); - for (i = 0; i <= callchain->curframe; i++) - buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]); - snprintf(buf, buf_end - buf, ")"); - return env->tmp_str_buf; -} - -static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain, - char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new) -{ - u64 changed_bits = old ^ new; - u64 new_ones = new & changed_bits; - u64 new_zeros = ~new & changed_bits; - - if (!changed_bits) - return; - bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx); - if (new_ones) { - bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones); - bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf); - } - if (new_zeros) { - bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros); - bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf); - } - bpf_log(&env->log, "\n"); -} - int bpf_jmp_offset(struct bpf_insn *insn) { u8 code = insn->code; @@ -507,62 +269,11 @@ bpf_insn_successors(struct bpf_verifier_env *env, u32 idx) __diag_pop(); -static struct func_instance *get_outer_instance(struct bpf_verifier_env *env, - struct func_instance *instance) -{ - struct callchain callchain = instance->callchain; - - /* Adjust @callchain to represent callchain one frame up */ - callchain.callsites[callchain.curframe] = 0; - callchain.sp_starts[callchain.curframe] = 0; - callchain.curframe--; - callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe]; - return __lookup_instance(env, &callchain); -} - -static u32 callchain_subprog_start(struct 
callchain *callchain) -{ - return callchain->sp_starts[callchain->curframe]; -} - -/* - * Transfer @may_read and @must_write_acc marks from the first instruction of @instance, - * to the call instruction in function instance calling @instance. - */ -static int propagate_to_outer_instance(struct bpf_verifier_env *env, - struct func_instance *instance) -{ - struct callchain *callchain = &instance->callchain; - u32 this_subprog_start, callsite, frame; - struct func_instance *outer_instance; - struct per_frame_masks *insn; - int err; - - this_subprog_start = callchain_subprog_start(callchain); - outer_instance = get_outer_instance(env, instance); - if (IS_ERR(outer_instance)) - return PTR_ERR(outer_instance); - callsite = callchain->callsites[callchain->curframe - 1]; - - reset_stack_write_marks(env, outer_instance, callsite); - for (frame = 0; frame < callchain->curframe; frame++) { - insn = get_frame_masks(instance, frame, this_subprog_start); - if (!insn) - continue; - bpf_mark_stack_write(env, frame, insn->must_write_acc); - err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before); - if (err) - return err; - } - commit_stack_write_marks(env, outer_instance); - return 0; -} static inline bool update_insn(struct bpf_verifier_env *env, struct func_instance *instance, u32 frame, u32 insn_idx) { - struct bpf_insn_aux_data *aux = env->insn_aux_data; - u64 new_before, new_after, must_write_acc; + spis_t new_before, new_after; struct per_frame_masks *insn, *succ_insn; struct bpf_iarray *succ; u32 s; @@ -574,77 +285,40 @@ static inline bool update_insn(struct bpf_verifier_env *env, changed = false; insn = get_frame_masks(instance, frame, insn_idx); - new_before = 0; - new_after = 0; - /* - * New "must_write_acc" is an intersection of all "must_write_acc" - * of successors plus all "must_write" slots of instruction itself. 
- */ - must_write_acc = U64_MAX; + new_before = SPIS_ZERO; + new_after = SPIS_ZERO; for (s = 0; s < succ->cnt; ++s) { succ_insn = get_frame_masks(instance, frame, succ->items[s]); - new_after |= succ_insn->live_before; - must_write_acc &= succ_insn->must_write_acc; + new_after = spis_or(new_after, succ_insn->live_before); } - must_write_acc |= insn->must_write; /* * New "live_before" is a union of all "live_before" of successors * minus slots written by instruction plus slots read by instruction. + * new_before = (new_after & ~insn->must_write) | insn->may_read */ - new_before = (new_after & ~insn->must_write) | insn->may_read; - changed |= new_before != insn->live_before; - changed |= must_write_acc != insn->must_write_acc; - if (unlikely(env->log.level & BPF_LOG_LEVEL2) && - (insn->may_read || insn->must_write || - insn_idx == callchain_subprog_start(&instance->callchain) || - aux[insn_idx].prune_point)) { - log_mask_change(env, &instance->callchain, "live", - frame, insn_idx, insn->live_before, new_before); - log_mask_change(env, &instance->callchain, "written", - frame, insn_idx, insn->must_write_acc, must_write_acc); - } + new_before = spis_or(spis_and(new_after, spis_not(insn->must_write)), + insn->may_read); + changed |= !spis_equal(new_before, insn->live_before); insn->live_before = new_before; - insn->must_write_acc = must_write_acc; return changed; } -/* Fixed-point computation of @live_before and @must_write_acc marks */ -static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance) +/* Fixed-point computation of @live_before marks */ +static void update_instance(struct bpf_verifier_env *env, struct func_instance *instance) { - u32 i, frame, po_start, po_end, cnt, this_subprog_start; - struct callchain *callchain = &instance->callchain; + u32 i, frame, po_start, po_end; int *insn_postorder = env->cfg.insn_postorder; struct bpf_subprog_info *subprog; - struct per_frame_masks *insn; bool changed; - int err; - this_subprog_start = 
callchain_subprog_start(callchain); - /* - * If must_write marks were updated must_write_acc needs to be reset - * (to account for the case when new must_write sets became smaller). - */ - if (instance->must_write_dropped) { - for (frame = 0; frame <= callchain->curframe; frame++) { - if (!instance->frames[frame]) - continue; - - for (i = 0; i < instance->insn_cnt; i++) { - insn = get_frame_masks(instance, frame, this_subprog_start + i); - insn->must_write_acc = 0; - } - } - } - - subprog = bpf_find_containing_subprog(env, this_subprog_start); + instance->must_write_initialized = true; + subprog = &env->subprog_info[instance->subprog]; po_start = subprog->postorder_start; po_end = (subprog + 1)->postorder_start; - cnt = 0; /* repeat until fixed point is reached */ do { - cnt++; changed = false; - for (frame = 0; frame <= instance->callchain.curframe; frame++) { + for (frame = 0; frame <= instance->depth; frame++) { if (!instance->frames[frame]) continue; @@ -652,57 +326,14 @@ static int update_instance(struct bpf_verifier_env *env, struct func_instance *i changed |= update_insn(env, instance, frame, insn_postorder[i]); } } while (changed); - - if (env->log.level & BPF_LOG_LEVEL2) - bpf_log(&env->log, "%s live stack update done in %d iterations\n", - fmt_callchain(env, callchain), cnt); - - /* transfer marks accumulated for outer frames to outer func instance (caller) */ - if (callchain->curframe > 0) { - err = propagate_to_outer_instance(env, instance); - if (err) - return err; - } - - return 0; -} - -/* - * Prepare all callchains within @env->cur_state for querying. - * This function should be called after each verifier.c:pop_stack() - * and whenever verifier.c:do_check_insn() processes subprogram exit. - * This would guarantee that visited verifier states with zero branches - * have their bpf_mark_stack_{read,write}() effects propagated in - * @env->liveness. 
- */ -int bpf_update_live_stack(struct bpf_verifier_env *env) -{ - struct func_instance *instance; - int err, frame; - - bpf_reset_live_stack_callchain(env); - for (frame = env->cur_state->curframe; frame >= 0; --frame) { - instance = lookup_instance(env, env->cur_state, frame); - if (IS_ERR(instance)) - return PTR_ERR(instance); - - if (instance->updated) { - err = update_instance(env, instance); - if (err) - return err; - instance->updated = false; - instance->must_write_dropped = false; - } - } - return 0; } -static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi) +static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 half_spi) { struct per_frame_masks *masks; masks = get_frame_masks(instance, frameno, insn_idx); - return masks && (masks->live_before & BIT(spi)); + return masks && spis_test_bit(masks->live_before, half_spi); } int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st) @@ -714,41 +345,1868 @@ int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_ memset(q, 0, sizeof(*q)); for (frame = 0; frame <= st->curframe; frame++) { instance = lookup_instance(env, st, frame); - if (IS_ERR(instance)) - return PTR_ERR(instance); - q->instances[frame] = instance; + if (IS_ERR_OR_NULL(instance)) + q->instances[frame] = NULL; + else + q->instances[frame] = instance; + if (frame < st->curframe) + q->callsites[frame] = st->frame[frame + 1]->callsite; } q->curframe = st->curframe; q->insn_idx = st->insn_idx; return 0; } -bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi) +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 half_spi) { /* - * Slot is alive if it is read before q->st->insn_idx in current func instance, + * Slot is alive if it is read before q->insn_idx in current func instance, * or if for some outer func instance: * - alive before callsite if callsite calls callback, 
otherwise * - alive after callsite */ struct live_stack_query *q = &env->liveness->live_stack_query; struct func_instance *instance, *curframe_instance; - u32 i, callsite; - bool alive; + u32 i, callsite, rel; + int cur_delta, delta; + bool alive = false; curframe_instance = q->instances[q->curframe]; - if (is_live_before(curframe_instance, q->insn_idx, frameno, spi)) + if (!curframe_instance) + return true; + cur_delta = (int)curframe_instance->depth - (int)q->curframe; + rel = frameno + cur_delta; + if (rel <= curframe_instance->depth) + alive = is_live_before(curframe_instance, q->insn_idx, rel, half_spi); + + if (alive) return true; for (i = frameno; i < q->curframe; i++) { - callsite = curframe_instance->callchain.callsites[i]; instance = q->instances[i]; + if (!instance) + return true; + /* Map actual frameno to frame index within this instance */ + delta = (int)instance->depth - (int)i; + rel = frameno + delta; + if (rel > instance->depth) + return true; + + /* Get callsite from verifier state, not from instance callchain */ + callsite = q->callsites[i]; + alive = bpf_calls_callback(env, callsite) - ? is_live_before(instance, callsite, frameno, spi) - : is_live_before(instance, callsite + 1, frameno, spi); + ? is_live_before(instance, callsite, rel, half_spi) + : is_live_before(instance, callsite + 1, rel, half_spi); if (alive) return true; } return false; } + +static char *fmt_subprog(struct bpf_verifier_env *env, int subprog) +{ + const char *name = env->subprog_info[subprog].name; + + snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf), + "subprog#%d%s%s", subprog, name ? " " : "", name ? 
name : ""); + return env->tmp_str_buf; +} + +static char *fmt_instance(struct bpf_verifier_env *env, struct func_instance *instance) +{ + snprintf(env->tmp_str_buf, sizeof(env->tmp_str_buf), + "(d%d,cs%d)", instance->depth, instance->callsite); + return env->tmp_str_buf; +} + +static int spi_off(int spi) +{ + return -(spi + 1) * BPF_REG_SIZE; +} + +/* + * When both halves of an 8-byte SPI are set, print as "-8","-16",... + * When only one half is set, print as "-4h","-8h",... + * Runs of 3+ consecutive fully-set SPIs are collapsed: "fp0-8..-24" + */ +static char *fmt_spis_mask(struct bpf_verifier_env *env, int frame, bool first, spis_t spis) +{ + int buf_sz = sizeof(env->tmp_str_buf); + char *buf = env->tmp_str_buf; + int spi, n, run_start; + + buf[0] = '\0'; + + for (spi = 0; spi < STACK_SLOTS / 2 && buf_sz > 0; spi++) { + bool lo = spis_test_bit(spis, spi * 2); + bool hi = spis_test_bit(spis, spi * 2 + 1); + const char *space = first ? "" : " "; + + if (!lo && !hi) + continue; + + if (!lo || !hi) { + /* half-spi */ + n = scnprintf(buf, buf_sz, "%sfp%d%d%s", + space, frame, spi_off(spi) + (lo ? 
STACK_SLOT_SZ : 0), "h"); + } else if (spi + 2 < STACK_SLOTS / 2 && + spis_test_bit(spis, spi * 2 + 2) && + spis_test_bit(spis, spi * 2 + 3) && + spis_test_bit(spis, spi * 2 + 4) && + spis_test_bit(spis, spi * 2 + 5)) { + /* 3+ consecutive full spis */ + run_start = spi; + while (spi + 1 < STACK_SLOTS / 2 && + spis_test_bit(spis, (spi + 1) * 2) && + spis_test_bit(spis, (spi + 1) * 2 + 1)) + spi++; + n = scnprintf(buf, buf_sz, "%sfp%d%d..%d", + space, frame, spi_off(run_start), spi_off(spi)); + } else { + /* just a full spi */ + n = scnprintf(buf, buf_sz, "%sfp%d%d", space, frame, spi_off(spi)); + } + first = false; + buf += n; + buf_sz -= n; + } + return env->tmp_str_buf; +} + +static void print_instance(struct bpf_verifier_env *env, struct func_instance *instance) +{ + int start = env->subprog_info[instance->subprog].start; + struct bpf_insn *insns = env->prog->insnsi; + struct per_frame_masks *masks; + int len = instance->insn_cnt; + int insn_idx, frame, i; + bool has_use, has_def; + u64 pos, insn_pos; + + if (!(env->log.level & BPF_LOG_LEVEL2)) + return; + + verbose(env, "stack use/def %s ", fmt_subprog(env, instance->subprog)); + verbose(env, "%s:\n", fmt_instance(env, instance)); + for (i = 0; i < len; i++) { + insn_idx = start + i; + has_use = false; + has_def = false; + pos = env->log.end_pos; + verbose(env, "%3d: ", insn_idx); + bpf_verbose_insn(env, &insns[insn_idx]); + bpf_vlog_reset(&env->log, env->log.end_pos - 1); /* remove \n */ + insn_pos = env->log.end_pos; + verbose(env, "%*c;", bpf_vlog_alignment(insn_pos - pos), ' '); + pos = env->log.end_pos; + verbose(env, " use: "); + for (frame = instance->depth; frame >= 0; --frame) { + masks = get_frame_masks(instance, frame, insn_idx); + if (!masks || spis_is_zero(masks->may_read)) + continue; + verbose(env, "%s", fmt_spis_mask(env, frame, !has_use, masks->may_read)); + has_use = true; + } + if (!has_use) + bpf_vlog_reset(&env->log, pos); + pos = env->log.end_pos; + verbose(env, " def: "); + for (frame = 
instance->depth; frame >= 0; --frame) { + masks = get_frame_masks(instance, frame, insn_idx); + if (!masks || spis_is_zero(masks->must_write)) + continue; + verbose(env, "%s", fmt_spis_mask(env, frame, !has_def, masks->must_write)); + has_def = true; + } + if (!has_def) + bpf_vlog_reset(&env->log, has_use ? pos : insn_pos); + verbose(env, "\n"); + if (bpf_is_ldimm64(&insns[insn_idx])) + i++; + } +} + +static int cmp_instances(const void *pa, const void *pb) +{ + struct func_instance *a = *(struct func_instance **)pa; + struct func_instance *b = *(struct func_instance **)pb; + int dcallsite = (int)a->callsite - b->callsite; + int ddepth = (int)a->depth - b->depth; + + if (dcallsite) + return dcallsite; + if (ddepth) + return ddepth; + return 0; +} + +/* print use/def slots for all instances ordered by callsite first, then by depth */ +static int print_instances(struct bpf_verifier_env *env) +{ + struct func_instance *instance, **sorted_instances; + struct bpf_liveness *liveness = env->liveness; + int i, bkt, cnt; + + cnt = 0; + hash_for_each(liveness->func_instances, bkt, instance, hl_node) + cnt++; + sorted_instances = kvmalloc_objs(*sorted_instances, cnt, GFP_KERNEL_ACCOUNT); + if (!sorted_instances) + return -ENOMEM; + cnt = 0; + hash_for_each(liveness->func_instances, bkt, instance, hl_node) + sorted_instances[cnt++] = instance; + sort(sorted_instances, cnt, sizeof(*sorted_instances), cmp_instances, NULL); + for (i = 0; i < cnt; i++) + print_instance(env, sorted_instances[i]); + kvfree(sorted_instances); + return 0; +} + +/* + * Per-register tracking state for compute_subprog_args(). + * Tracks which frame's FP a value is derived from + * and the byte offset from that frame's FP. 
+ * + * The .frame field forms a lattice with three levels of precision: + * + * precise {frame=N, off=V} -- known absolute frame index and byte offset + * | + * offset-imprecise {frame=N, cnt=0} + * | -- known frame identity, unknown offset + * fully-imprecise {frame=ARG_IMPRECISE, mask=bitmask} + * -- unknown frame identity; .mask is a + * bitmask of which frame indices might be + * involved + * + * At CFG merge points, arg_track_join() moves down the lattice: + * - same frame + same offset -> precise + * - same frame + different offset -> offset-imprecise + * - different frames -> fully-imprecise (bitmask OR) + * + * At memory access sites (LDX/STX/ST), offset-imprecise marks only + * the known frame's access mask as SPIS_ALL, while fully-imprecise + * iterates bits in the bitmask and routes each frame to its target. + */ +#define MAX_ARG_OFFSETS 4 + +struct arg_track { + union { + s16 off[MAX_ARG_OFFSETS]; /* byte offsets; off_cnt says how many */ + u16 mask; /* arg bitmask when arg == ARG_IMPRECISE */ + }; + s8 frame; /* absolute frame index, or enum arg_track_state */ + s8 off_cnt; /* 0 = offset-imprecise, 1-4 = # of precise offsets */ +}; + +enum arg_track_state { + ARG_NONE = -1, /* not derived from any argument */ + ARG_UNVISITED = -2, /* not yet reached by dataflow */ + ARG_IMPRECISE = -3, /* lost identity; .mask is arg bitmask */ +}; + +/* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */ +#define MAX_ARG_SPILL_SLOTS 64 + +static bool arg_is_visited(const struct arg_track *at) +{ + return at->frame != ARG_UNVISITED; +} + +static bool arg_is_fp(const struct arg_track *at) +{ + return at->frame >= 0 || at->frame == ARG_IMPRECISE; +} + +static void verbose_arg_track(struct bpf_verifier_env *env, struct arg_track *at) +{ + int i; + + switch (at->frame) { + case ARG_NONE: verbose(env, "_"); break; + case ARG_UNVISITED: verbose(env, "?"); break; + case ARG_IMPRECISE: verbose(env, "IMP%x", at->mask); break; + default: + /* frame >= 0: 
absolute frame index */ + if (at->off_cnt == 0) { + verbose(env, "fp%d ?", at->frame); + } else { + for (i = 0; i < at->off_cnt; i++) { + if (i) + verbose(env, "|"); + verbose(env, "fp%d%+d", at->frame, at->off[i]); + } + } + break; + } +} + +static bool arg_track_eq(const struct arg_track *a, const struct arg_track *b) +{ + int i; + + if (a->frame != b->frame) + return false; + if (a->frame == ARG_IMPRECISE) + return a->mask == b->mask; + if (a->frame < 0) + return true; + if (a->off_cnt != b->off_cnt) + return false; + for (i = 0; i < a->off_cnt; i++) + if (a->off[i] != b->off[i]) + return false; + return true; +} + +static struct arg_track arg_single(s8 arg, s16 off) +{ + struct arg_track at = {}; + + at.frame = arg; + at.off[0] = off; + at.off_cnt = 1; + return at; +} + +/* + * Merge two sorted offset arrays, deduplicate. + * Returns off_cnt=0 if the result exceeds MAX_ARG_OFFSETS. + * Both args must have the same frame and off_cnt > 0. + */ +static struct arg_track arg_merge_offsets(struct arg_track a, struct arg_track b) +{ + struct arg_track result = { .frame = a.frame }; + struct arg_track imp = { .frame = a.frame }; + int i = 0, j = 0, k = 0; + + while (i < a.off_cnt && j < b.off_cnt) { + s16 v; + + if (a.off[i] <= b.off[j]) { + v = a.off[i++]; + if (v == b.off[j]) + j++; + } else { + v = b.off[j++]; + } + if (k > 0 && result.off[k - 1] == v) + continue; + if (k >= MAX_ARG_OFFSETS) + return imp; + result.off[k++] = v; + } + while (i < a.off_cnt) { + if (k >= MAX_ARG_OFFSETS) + return imp; + result.off[k++] = a.off[i++]; + } + while (j < b.off_cnt) { + if (k >= MAX_ARG_OFFSETS) + return imp; + result.off[k++] = b.off[j++]; + } + result.off_cnt = k; + return result; +} + +/* + * Merge two arg_tracks into ARG_IMPRECISE, collecting the frame + * bits from both operands. Precise frame indices (frame >= 0) + * contribute a single bit; existing ARG_IMPRECISE values + * contribute their full bitmask. 
+ */ +static struct arg_track arg_join_imprecise(struct arg_track a, struct arg_track b) +{ + u32 m = 0; + + if (a.frame >= 0) + m |= BIT(a.frame); + else if (a.frame == ARG_IMPRECISE) + m |= a.mask; + + if (b.frame >= 0) + m |= BIT(b.frame); + else if (b.frame == ARG_IMPRECISE) + m |= b.mask; + + return (struct arg_track){ .mask = m, .frame = ARG_IMPRECISE }; +} + +/* Join two arg_track values at merge points */ +static struct arg_track __arg_track_join(struct arg_track a, struct arg_track b) +{ + if (!arg_is_visited(&b)) + return a; + if (!arg_is_visited(&a)) + return b; + if (a.frame == b.frame && a.frame >= 0) { + /* Both offset-imprecise: stay imprecise */ + if (a.off_cnt == 0 || b.off_cnt == 0) + return (struct arg_track){ .frame = a.frame }; + /* Merge offset sets; falls back to off_cnt=0 if >4 */ + return arg_merge_offsets(a, b); + } + + /* + * args are different, but one of them is known + * arg + none -> arg + * none + arg -> arg + * + * none + none -> none + */ + if (a.frame == ARG_NONE && b.frame == ARG_NONE) + return a; + if (a.frame >= 0 && b.frame == ARG_NONE) { + /* + * When joining single fp-N add fake fp+0 to + * keep stack_use and prevent stack_def + */ + if (a.off_cnt == 1) + return arg_merge_offsets(a, arg_single(a.frame, 0)); + return a; + } + if (b.frame >= 0 && a.frame == ARG_NONE) { + if (b.off_cnt == 1) + return arg_merge_offsets(b, arg_single(b.frame, 0)); + return b; + } + + return arg_join_imprecise(a, b); +} + +static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, int r, + struct arg_track *in, struct arg_track out) +{ + struct arg_track old = *in; + struct arg_track new_val = __arg_track_join(old, out); + + if (arg_track_eq(&new_val, &old)) + return false; + + *in = new_val; + if (!(env->log.level & BPF_LOG_LEVEL2) || !arg_is_visited(&old)) + return true; + + verbose(env, "arg JOIN insn %d -> %d ", idx, target); + if (r >= 0) + verbose(env, "r%d: ", r); + else + verbose(env, "fp%+d: ", r * 8); + 
verbose_arg_track(env, &old); + verbose(env, " + "); + verbose_arg_track(env, &out); + verbose(env, " => "); + verbose_arg_track(env, &new_val); + verbose(env, "\n"); + return true; +} + +/* + * Compute the result when an ALU op destroys offset precision. + * If a single arg is identifiable, preserve it with OFF_IMPRECISE. + * If two different args are involved or one is already ARG_IMPRECISE, + * the result is fully ARG_IMPRECISE. + */ +static void arg_track_alu64(struct arg_track *dst, const struct arg_track *src) +{ + WARN_ON_ONCE(!arg_is_visited(dst)); + WARN_ON_ONCE(!arg_is_visited(src)); + + if (dst->frame >= 0 && (src->frame == ARG_NONE || src->frame == dst->frame)) { + /* + * rX += rY where rY is not arg derived + * rX += rX + */ + dst->off_cnt = 0; + return; + } + if (src->frame >= 0 && dst->frame == ARG_NONE) { + /* + * rX += rY where rX is not arg derived + * rY identity leaks into rX + */ + dst->off_cnt = 0; + dst->frame = src->frame; + return; + } + + if (dst->frame == ARG_NONE && src->frame == ARG_NONE) + return; + + *dst = arg_join_imprecise(*dst, *src); +} + +static bool arg_add(s16 off, s64 delta, s16 *out) +{ + s16 d = delta; + + if (d != delta) + return true; + return check_add_overflow(off, d, out); +} + +static void arg_padd(struct arg_track *at, s64 delta) +{ + int i; + + if (at->off_cnt == 0) + return; + for (i = 0; i < at->off_cnt; i++) { + s16 new_off; + + if (arg_add(at->off[i], delta, &new_off)) { + at->off_cnt = 0; + return; + } + at->off[i] = new_off; + } +} + +/* + * Convert a byte offset from FP to a callee stack slot index. + * Returns -1 if out of range or not 8-byte aligned. + * Slot 0 = fp-8, slot 1 = fp-16, ..., slot 7 = fp-64, .... 
+ */ +static int fp_off_to_slot(s16 off) +{ + if (off >= 0 || off < -(int)(MAX_ARG_SPILL_SLOTS * 8)) + return -1; + if (off % 8) + return -1; + return (-off) / 8 - 1; +} + +static struct arg_track fill_from_stack(struct bpf_insn *insn, + struct arg_track *at_out, int reg, + struct arg_track *at_stack_out, + int depth) +{ + struct arg_track imp = { + .mask = (1u << (depth + 1)) - 1, + .frame = ARG_IMPRECISE + }; + struct arg_track result = { .frame = ARG_NONE }; + int cnt, i; + + if (reg == BPF_REG_FP) { + int slot = fp_off_to_slot(insn->off); + + return slot >= 0 ? at_stack_out[slot] : imp; + } + cnt = at_out[reg].off_cnt; + if (cnt == 0) + return imp; + + for (i = 0; i < cnt; i++) { + s16 fp_off, slot; + + if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) + return imp; + slot = fp_off_to_slot(fp_off); + if (slot < 0) + return imp; + result = __arg_track_join(result, at_stack_out[slot]); + } + return result; +} + +/* + * Spill @val to all possible stack slots indicated by the FP offsets in @reg. + * For an 8-byte store, single candidate slot gets @val. multi-slots are joined. + * sub-8-byte store joins with ARG_NONE. + * When exact offset is unknown conservatively add reg values to all slots in at_stack_out. + */ +static void spill_to_stack(struct bpf_insn *insn, struct arg_track *at_out, + int reg, struct arg_track *at_stack_out, + struct arg_track *val, u32 sz) +{ + struct arg_track none = { .frame = ARG_NONE }; + struct arg_track new_val = sz == 8 ? 
*val : none; + int cnt, i; + + if (reg == BPF_REG_FP) { + int slot = fp_off_to_slot(insn->off); + + if (slot >= 0) + at_stack_out[slot] = new_val; + return; + } + cnt = at_out[reg].off_cnt; + if (cnt == 0) { + for (int slot = 0; slot < MAX_ARG_SPILL_SLOTS; slot++) + at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val); + return; + } + for (i = 0; i < cnt; i++) { + s16 fp_off; + int slot; + + if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) + continue; + slot = fp_off_to_slot(fp_off); + if (slot < 0) + continue; + if (cnt == 1) + at_stack_out[slot] = new_val; + else + at_stack_out[slot] = __arg_track_join(at_stack_out[slot], new_val); + } +} + +/* + * Clear all tracked callee stack slots overlapping the byte range + * [off, off+sz-1] where off is a negative FP-relative offset. + */ +static void clear_overlapping_stack_slots(struct arg_track *at_stack, s16 off, u32 sz, int cnt) +{ + struct arg_track none = { .frame = ARG_NONE }; + + if (cnt == 0) { + for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) + at_stack[i] = __arg_track_join(at_stack[i], none); + return; + } + for (int i = 0; i < MAX_ARG_SPILL_SLOTS; i++) { + int slot_start = -((i + 1) * 8); + int slot_end = slot_start + 8; + + if (slot_start < off + (int)sz && slot_end > off) { + if (cnt == 1) + at_stack[i] = none; + else + at_stack[i] = __arg_track_join(at_stack[i], none); + } + } +} + +/* + * Clear stack slots overlapping all possible FP offsets in @reg. 
+ */ +static void clear_stack_for_all_offs(struct bpf_insn *insn, + struct arg_track *at_out, int reg, + struct arg_track *at_stack_out, u32 sz) +{ + int cnt, i; + + if (reg == BPF_REG_FP) { + clear_overlapping_stack_slots(at_stack_out, insn->off, sz, 1); + return; + } + cnt = at_out[reg].off_cnt; + if (cnt == 0) { + clear_overlapping_stack_slots(at_stack_out, 0, sz, cnt); + return; + } + for (i = 0; i < cnt; i++) { + s16 fp_off; + + if (arg_add(at_out[reg].off[i], insn->off, &fp_off)) { + clear_overlapping_stack_slots(at_stack_out, 0, sz, 0); + break; + } + clear_overlapping_stack_slots(at_stack_out, fp_off, sz, cnt); + } +} + +static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, int idx, + struct arg_track *at_in, struct arg_track *at_stack_in, + struct arg_track *at_out, struct arg_track *at_stack_out) +{ + bool printed = false; + int i; + + if (!(env->log.level & BPF_LOG_LEVEL2)) + return; + for (i = 0; i < MAX_BPF_REG; i++) { + if (arg_track_eq(&at_out[i], &at_in[i])) + continue; + if (!printed) { + verbose(env, "%3d: ", idx); + bpf_verbose_insn(env, insn); + bpf_vlog_reset(&env->log, env->log.end_pos - 1); + printed = true; + } + verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]); + verbose(env, " -> "); verbose_arg_track(env, &at_out[i]); + } + for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) { + if (arg_track_eq(&at_stack_out[i], &at_stack_in[i])) + continue; + if (!printed) { + verbose(env, "%3d: ", idx); + bpf_verbose_insn(env, insn); + bpf_vlog_reset(&env->log, env->log.end_pos - 1); + printed = true; + } + verbose(env, "\tfp%+d: ", -(i + 1) * 8); verbose_arg_track(env, &at_stack_in[i]); + verbose(env, " -> "); verbose_arg_track(env, &at_stack_out[i]); + } + if (printed) + verbose(env, "\n"); +} + +static bool can_be_local_fp(int depth, int regno, struct arg_track *at) +{ + return regno == BPF_REG_FP || at->frame == depth || + (at->frame == ARG_IMPRECISE && (at->mask & BIT(depth))); +} + +/* + * Pure dataflow transfer 
function for arg_track state. + * Updates at_out[] based on how the instruction modifies registers. + * Tracks spill/fill, but not other memory accesses. + */ +static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, + int insn_idx, + struct arg_track *at_out, struct arg_track *at_stack_out, + struct func_instance *instance, + u32 *callsites) +{ + int depth = instance->depth; + u8 class = BPF_CLASS(insn->code); + u8 code = BPF_OP(insn->code); + struct arg_track *dst = &at_out[insn->dst_reg]; + struct arg_track *src = &at_out[insn->src_reg]; + struct arg_track none = { .frame = ARG_NONE }; + int r; + + if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) { + if (code == BPF_MOV) { + *dst = none; + } else if (dst->frame >= 0) { + if (code == BPF_ADD) + arg_padd(dst, insn->imm); + else if (code == BPF_SUB) + arg_padd(dst, -(s64)insn->imm); + else + /* Any other 64-bit alu on the pointer makes it imprecise */ + dst->off_cnt = 0; + } /* else if dst->frame is imprecise it stays so */ + } else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_X) { + if (code == BPF_MOV) { + if (insn->off == 0) { + *dst = *src; + } else { + /* addr_space_cast destroys a pointer */ + *dst = none; + } + } else { + arg_track_alu64(dst, src); + } + } else if (class == BPF_ALU) { + /* + * 32-bit alu destroys the pointer. + * If src was a pointer it cannot leak into dst + */ + *dst = none; + } else if (class == BPF_JMP && code == BPF_CALL) { + /* + * at_stack_out[slot] is not cleared by the helper and subprog calls. + * The fill_from_stack() may return the stale spill — which is an FP-derived arg_track + * (the value that was originally spilled there). The loaded register then carries + * a phantom FP-derived identity that doesn't correspond to what's actually in the slot. + * This phantom FP pointer propagates forward, and wherever it's subsequently used + * (as a helper argument, another store, etc.), it sets stack liveness bits. 
+ * Those bits correspond to stack accesses that don't actually happen. + * So the effect is over-reporting stack liveness — marking slots as live that aren't + * actually accessed. The verifier preserves more state than necessary across calls, + * which is conservative. + * + * helpers can scratch stack slots, but they won't make a valid pointer out of it. + * subprogs are allowed to write into parent slots, but they cannot write + * _any_ FP-derived pointer into it (either their own or parent's FP). + */ + for (r = BPF_REG_0; r <= BPF_REG_5; r++) + at_out[r] = none; + } else if (class == BPF_LDX) { + u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); + bool src_is_local_fp = can_be_local_fp(depth, insn->src_reg, src); + + /* + * Reload from callee stack: if src is current-frame FP-derived + * and the load is an 8-byte BPF_MEM, try to restore the spill + * identity. For imprecise sources fill_from_stack() returns + * ARG_IMPRECISE (off_cnt == 0). + */ + if (src_is_local_fp && BPF_MODE(insn->code) == BPF_MEM && sz == 8) { + *dst = fill_from_stack(insn, at_out, insn->src_reg, at_stack_out, depth); + } else if (src->frame >= 0 && src->frame < depth && + BPF_MODE(insn->code) == BPF_MEM && sz == 8) { + struct arg_track *parent_stack = + env->callsite_at_stack[callsites[src->frame]]; + + *dst = fill_from_stack(insn, at_out, insn->src_reg, + parent_stack, src->frame); + } else if (src->frame == ARG_IMPRECISE && + !(src->mask & BIT(depth)) && src->mask && + BPF_MODE(insn->code) == BPF_MEM && sz == 8) { + /* + * Imprecise src with only parent-frame bits: + * conservative fallback. 
+ */ + *dst = *src; + } else { + *dst = none; + } + } else if (class == BPF_LD && BPF_MODE(insn->code) == BPF_IMM) { + *dst = none; + } else if (class == BPF_STX) { + u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); + bool dst_is_local_fp; + + /* Track spills to current-frame FP-derived callee stack */ + dst_is_local_fp = can_be_local_fp(depth, insn->dst_reg, dst); + if (dst_is_local_fp && BPF_MODE(insn->code) == BPF_MEM) + spill_to_stack(insn, at_out, insn->dst_reg, + at_stack_out, src, sz); + + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + if (dst_is_local_fp && insn->imm != BPF_LOAD_ACQ) + clear_stack_for_all_offs(insn, at_out, insn->dst_reg, + at_stack_out, sz); + + if (insn->imm == BPF_CMPXCHG) + at_out[BPF_REG_0] = none; + else if (insn->imm == BPF_LOAD_ACQ) + *dst = none; + else if (insn->imm & BPF_FETCH) + *src = none; + } + } else if (class == BPF_ST && BPF_MODE(insn->code) == BPF_MEM) { + u32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); + bool dst_is_local_fp = can_be_local_fp(depth, insn->dst_reg, dst); + + /* BPF_ST to FP-derived dst: clear overlapping stack slots */ + if (dst_is_local_fp) + clear_stack_for_all_offs(insn, at_out, insn->dst_reg, + at_stack_out, sz); + } +} + +/* + * Record access_bytes from helper/kfunc or load/store insn. + * access_bytes > 0: stack read + * access_bytes < 0: stack write + * access_bytes == S64_MIN: unknown — conservative, mark [0..slot] as read + * access_bytes == 0: no access + * + */ +static int record_stack_access_off(struct func_instance *instance, s64 fp_off, + s64 access_bytes, u32 frame, u32 insn_idx) +{ + s32 slot_hi, slot_lo; + spis_t mask; + + if (fp_off >= 0) + /* + * out of bounds stack access doesn't contribute + * into actual stack liveness. It will be rejected + * by the main verifier pass later. 
+ */ + return 0; + if (access_bytes == S64_MIN) { + /* helper/kfunc read unknown amount of bytes from fp_off until fp+0 */ + slot_hi = (-fp_off - 1) / STACK_SLOT_SZ; + mask = SPIS_ZERO; + spis_or_range(&mask, 0, slot_hi); + return mark_stack_read(instance, frame, insn_idx, mask); + } + if (access_bytes > 0) { + /* Mark any touched slot as use */ + slot_hi = (-fp_off - 1) / STACK_SLOT_SZ; + slot_lo = max_t(s32, (-fp_off - access_bytes) / STACK_SLOT_SZ, 0); + mask = SPIS_ZERO; + spis_or_range(&mask, slot_lo, slot_hi); + return mark_stack_read(instance, frame, insn_idx, mask); + } else if (access_bytes < 0) { + /* Mark only fully covered slots as def */ + access_bytes = -access_bytes; + slot_hi = (-fp_off) / STACK_SLOT_SZ - 1; + slot_lo = max_t(s32, (-fp_off - access_bytes + STACK_SLOT_SZ - 1) / STACK_SLOT_SZ, 0); + if (slot_lo <= slot_hi) { + mask = SPIS_ZERO; + spis_or_range(&mask, slot_lo, slot_hi); + return mark_stack_write(instance, frame, insn_idx, mask); + } + } + return 0; +} + +/* + * 'arg' is FP-derived argument to helper/kfunc or load/store that + * reads (positive) or writes (negative) 'access_bytes' into 'use' or 'def'. + */ +static int record_stack_access(struct func_instance *instance, + const struct arg_track *arg, + s64 access_bytes, u32 frame, u32 insn_idx) +{ + int i, err; + + if (access_bytes == 0) + return 0; + if (arg->off_cnt == 0) { + if (access_bytes > 0 || access_bytes == S64_MIN) + return mark_stack_read(instance, frame, insn_idx, SPIS_ALL); + return 0; + } + if (access_bytes != S64_MIN && access_bytes < 0 && arg->off_cnt != 1) + /* multi-offset write cannot set stack_def */ + return 0; + + for (i = 0; i < arg->off_cnt; i++) { + err = record_stack_access_off(instance, arg->off[i], access_bytes, frame, insn_idx); + if (err) + return err; + } + return 0; +} + +/* + * When a pointer is ARG_IMPRECISE, conservatively mark every frame in + * the bitmask as fully used. 
+ */ +static int record_imprecise(struct func_instance *instance, u32 mask, u32 insn_idx) +{ + int depth = instance->depth; + int f, err; + + for (f = 0; mask; f++, mask >>= 1) { + if (!(mask & 1)) + continue; + if (f <= depth) { + err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); + if (err) + return err; + } + } + return 0; +} + +/* Record load/store access for a given 'at' state of 'insn'. */ +static int record_load_store_access(struct bpf_verifier_env *env, + struct func_instance *instance, + struct arg_track *at, int insn_idx) +{ + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; + int depth = instance->depth; + s32 sz = bpf_size_to_bytes(BPF_SIZE(insn->code)); + u8 class = BPF_CLASS(insn->code); + struct arg_track resolved, *ptr; + int oi; + + switch (class) { + case BPF_LDX: + ptr = &at[insn->src_reg]; + break; + case BPF_STX: + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + if (insn->imm == BPF_STORE_REL) + sz = -sz; + if (insn->imm == BPF_LOAD_ACQ) + ptr = &at[insn->src_reg]; + else + ptr = &at[insn->dst_reg]; + } else { + ptr = &at[insn->dst_reg]; + sz = -sz; + } + break; + case BPF_ST: + ptr = &at[insn->dst_reg]; + sz = -sz; + break; + default: + return 0; + } + + /* Resolve offsets: fold insn->off into arg_track */ + if (ptr->off_cnt > 0) { + resolved.off_cnt = ptr->off_cnt; + resolved.frame = ptr->frame; + for (oi = 0; oi < ptr->off_cnt; oi++) { + if (arg_add(ptr->off[oi], insn->off, &resolved.off[oi])) { + resolved.off_cnt = 0; + break; + } + } + ptr = &resolved; + } + + if (ptr->frame >= 0 && ptr->frame <= depth) + return record_stack_access(instance, ptr, sz, ptr->frame, insn_idx); + if (ptr->frame == ARG_IMPRECISE) + return record_imprecise(instance, ptr->mask, insn_idx); + /* ARG_NONE: not derived from any frame pointer, skip */ + return 0; +} + +/* Record stack access for a given 'at' state of helper/kfunc 'insn' */ +static int record_call_access(struct bpf_verifier_env *env, + struct func_instance *instance, + struct arg_track *at, + int 
insn_idx) +{ + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; + int depth = instance->depth; + struct bpf_call_summary cs; + int r, err = 0, num_params = 5; + + if (bpf_pseudo_call(insn)) + return 0; + + if (bpf_get_call_summary(env, insn, &cs)) + num_params = cs.num_params; + + for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) { + int frame = at[r].frame; + s64 bytes; + + if (!arg_is_fp(&at[r])) + continue; + + if (bpf_helper_call(insn)) { + bytes = bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx); + } else if (bpf_pseudo_kfunc_call(insn)) { + bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx); + } else { + for (int f = 0; f <= depth; f++) { + err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); + if (err) + return err; + } + return 0; + } + if (bytes == 0) + continue; + + if (frame >= 0 && frame <= depth) + err = record_stack_access(instance, &at[r], bytes, frame, insn_idx); + else if (frame == ARG_IMPRECISE) + err = record_imprecise(instance, at[r].mask, insn_idx); + if (err) + return err; + } + return 0; +} + +/* + * For a calls_callback helper, find the callback subprog and determine + * which caller register maps to which callback register for FP passthrough. 
+ */ +static int find_callback_subprog(struct bpf_verifier_env *env, + struct bpf_insn *insn, int insn_idx, + int *caller_reg, int *callee_reg) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + int cb_reg = -1; + + *caller_reg = -1; + *callee_reg = -1; + + if (!bpf_helper_call(insn)) + return -1; + switch (insn->imm) { + case BPF_FUNC_loop: + /* bpf_loop(nr, cb, ctx, flags): cb=R2, R3->cb R2 */ + cb_reg = BPF_REG_2; + *caller_reg = BPF_REG_3; + *callee_reg = BPF_REG_2; + break; + case BPF_FUNC_for_each_map_elem: + /* for_each_map_elem(map, cb, ctx, flags): cb=R2, R3->cb R4 */ + cb_reg = BPF_REG_2; + *caller_reg = BPF_REG_3; + *callee_reg = BPF_REG_4; + break; + case BPF_FUNC_find_vma: + /* find_vma(task, addr, cb, ctx, flags): cb=R3, R4->cb R3 */ + cb_reg = BPF_REG_3; + *caller_reg = BPF_REG_4; + *callee_reg = BPF_REG_3; + break; + case BPF_FUNC_user_ringbuf_drain: + /* user_ringbuf_drain(map, cb, ctx, flags): cb=R2, R3->cb R2 */ + cb_reg = BPF_REG_2; + *caller_reg = BPF_REG_3; + *callee_reg = BPF_REG_2; + break; + default: + return -1; + } + + if (!(aux->const_reg_subprog_mask & BIT(cb_reg))) + return -2; + + return aux->const_reg_vals[cb_reg]; +} + +/* Per-subprog intermediate state kept alive across analysis phases */ +struct subprog_at_info { + struct arg_track (*at_in)[MAX_BPF_REG]; + int len; +}; + +static void print_subprog_arg_access(struct bpf_verifier_env *env, + int subprog, + struct subprog_at_info *info, + struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS]) +{ + struct bpf_insn *insns = env->prog->insnsi; + int start = env->subprog_info[subprog].start; + int len = info->len; + int i, r; + + if (!(env->log.level & BPF_LOG_LEVEL2)) + return; + + verbose(env, "%s:\n", fmt_subprog(env, subprog)); + for (i = 0; i < len; i++) { + int idx = start + i; + bool has_extra = false; + u8 cls = BPF_CLASS(insns[idx].code); + bool is_ldx_stx_call = cls == BPF_LDX || cls == BPF_STX || + insns[idx].code == (BPF_JMP | BPF_CALL); + + verbose(env, 
"%3d: ", idx); + bpf_verbose_insn(env, &insns[idx]); + + /* Collect what needs printing */ + if (is_ldx_stx_call && + arg_is_visited(&info->at_in[i][0])) { + for (r = 0; r < MAX_BPF_REG - 1; r++) + if (arg_is_fp(&info->at_in[i][r])) + has_extra = true; + } + if (is_ldx_stx_call) { + for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) + if (arg_is_fp(&at_stack_in[i][r])) + has_extra = true; + } + + if (!has_extra) { + if (bpf_is_ldimm64(&insns[idx])) + i++; + continue; + } + + bpf_vlog_reset(&env->log, env->log.end_pos - 1); + verbose(env, " //"); + + if (is_ldx_stx_call && info->at_in && + arg_is_visited(&info->at_in[i][0])) { + for (r = 0; r < MAX_BPF_REG - 1; r++) { + if (!arg_is_fp(&info->at_in[i][r])) + continue; + verbose(env, " r%d=", r); + verbose_arg_track(env, &info->at_in[i][r]); + } + } + + if (is_ldx_stx_call) { + for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) { + if (!arg_is_fp(&at_stack_in[i][r])) + continue; + verbose(env, " fp%+d=", -(r + 1) * 8); + verbose_arg_track(env, &at_stack_in[i][r]); + } + } + + verbose(env, "\n"); + if (bpf_is_ldimm64(&insns[idx])) + i++; + } +} + +/* + * Compute arg tracking dataflow for a single subprog. + * Runs forward fixed-point with arg_track_xfer(), then records + * memory accesses in a single linear pass over converged state. + * + * @callee_entry: pre-populated entry state for R1-R5 + * NULL for main (subprog 0). + * @info: stores at_in, len for debug printing. 
+ */ +static int compute_subprog_args(struct bpf_verifier_env *env, + struct subprog_at_info *info, + struct arg_track *callee_entry, + struct func_instance *instance, + u32 *callsites) +{ + int subprog = instance->subprog; + struct bpf_insn *insns = env->prog->insnsi; + int depth = instance->depth; + int start = env->subprog_info[subprog].start; + int po_start = env->subprog_info[subprog].postorder_start; + int end = env->subprog_info[subprog + 1].start; + int po_end = env->subprog_info[subprog + 1].postorder_start; + int len = end - start; + struct arg_track (*at_in)[MAX_BPF_REG] = NULL; + struct arg_track at_out[MAX_BPF_REG]; + struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL; + struct arg_track *at_stack_out = NULL; + struct arg_track unvisited = { .frame = ARG_UNVISITED }; + struct arg_track none = { .frame = ARG_NONE }; + bool changed; + int i, p, r, err = -ENOMEM; + + at_in = kvmalloc_objs(*at_in, len, GFP_KERNEL_ACCOUNT); + if (!at_in) + goto err_free; + + at_stack_in = kvmalloc_objs(*at_stack_in, len, GFP_KERNEL_ACCOUNT); + if (!at_stack_in) + goto err_free; + + at_stack_out = kvmalloc_objs(*at_stack_out, MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT); + if (!at_stack_out) + goto err_free; + + for (i = 0; i < len; i++) { + for (r = 0; r < MAX_BPF_REG; r++) + at_in[i][r] = unvisited; + for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) + at_stack_in[i][r] = unvisited; + } + + for (r = 0; r < MAX_BPF_REG; r++) + at_in[0][r] = none; + + /* Entry: R10 is always precisely the current frame's FP */ + at_in[0][BPF_REG_FP] = arg_single(depth, 0); + + /* R1-R5: from caller or ARG_NONE for main */ + if (callee_entry) { + for (r = BPF_REG_1; r <= BPF_REG_5; r++) + at_in[0][r] = callee_entry[r]; + } + + /* Entry: all stack slots are ARG_NONE */ + for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) + at_stack_in[0][r] = none; + + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth); + + /* Forward fixed-point iteration in 
reverse post order */ +redo: + changed = false; + for (p = po_end - 1; p >= po_start; p--) { + int idx = env->cfg.insn_postorder[p]; + int i = idx - start; + struct bpf_insn *insn = &insns[idx]; + struct bpf_iarray *succ; + + if (!arg_is_visited(&at_in[i][0]) && !arg_is_visited(&at_in[i][1])) + continue; + + memcpy(at_out, at_in[i], sizeof(at_out)); + memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out)); + + arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites); + arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out); + + /* Propagate to successors within this subprogram */ + succ = bpf_insn_successors(env, idx); + for (int s = 0; s < succ->cnt; s++) { + int target = succ->items[s]; + int ti; + + /* Filter: stay within the subprogram's range */ + if (target < start || target >= end) + continue; + ti = target - start; + + for (r = 0; r < MAX_BPF_REG; r++) + changed |= arg_track_join(env, idx, target, r, + &at_in[ti][r], at_out[r]); + + for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) + changed |= arg_track_join(env, idx, target, -r - 1, + &at_stack_in[ti][r], at_stack_out[r]); + } + } + if (changed) + goto redo; + + /* Record memory accesses using converged at_in (RPO skips dead code) */ + for (p = po_end - 1; p >= po_start; p--) { + int idx = env->cfg.insn_postorder[p]; + int i = idx - start; + struct bpf_insn *insn = &insns[idx]; + + err = record_load_store_access(env, instance, at_in[i], idx); + if (err) + goto err_free; + + if (insn->code == (BPF_JMP | BPF_CALL)) { + err = record_call_access(env, instance, at_in[i], idx); + if (err) + goto err_free; + } + + if (bpf_pseudo_call(insn) || bpf_calls_callback(env, idx)) { + kvfree(env->callsite_at_stack[idx]); + env->callsite_at_stack[idx] = + kvmalloc_objs(*env->callsite_at_stack[idx], + MAX_ARG_SPILL_SLOTS, GFP_KERNEL_ACCOUNT); + if (!env->callsite_at_stack[idx]) { + err = -ENOMEM; + goto err_free; + } + memcpy(env->callsite_at_stack[idx], + 
at_stack_in[i], sizeof(struct arg_track) * MAX_ARG_SPILL_SLOTS); + } + } + + info->at_in = at_in; + at_in = NULL; + info->len = len; + print_subprog_arg_access(env, subprog, info, at_stack_in); + err = 0; + +err_free: + kvfree(at_stack_out); + kvfree(at_stack_in); + kvfree(at_in); + return err; +} + +/* Return true if any of R1-R5 is derived from a frame pointer. */ +static bool has_fp_args(struct arg_track *args) +{ + for (int r = BPF_REG_1; r <= BPF_REG_5; r++) + if (args[r].frame != ARG_NONE) + return true; + return false; +} + +/* + * Merge a freshly analyzed instance into the original. + * may_read: union (any pass might read the slot). + * must_write: intersection (only slots written on ALL passes are guaranteed). + * live_before is recomputed by a subsequent update_instance() on @dst. + */ +static void merge_instances(struct func_instance *dst, struct func_instance *src) +{ + int f, i; + + for (f = 0; f <= dst->depth; f++) { + if (!src->frames[f]) { + /* This pass didn't touch frame f — must_write intersects with empty. */ + if (dst->frames[f]) + for (i = 0; i < dst->insn_cnt; i++) + dst->frames[f][i].must_write = SPIS_ZERO; + continue; + } + if (!dst->frames[f]) { + /* Previous pass didn't touch frame f — take src, zero must_write. 
*/ + dst->frames[f] = src->frames[f]; + src->frames[f] = NULL; + for (i = 0; i < dst->insn_cnt; i++) + dst->frames[f][i].must_write = SPIS_ZERO; + continue; + } + for (i = 0; i < dst->insn_cnt; i++) { + dst->frames[f][i].may_read = + spis_or(dst->frames[f][i].may_read, + src->frames[f][i].may_read); + dst->frames[f][i].must_write = + spis_and(dst->frames[f][i].must_write, + src->frames[f][i].must_write); + } + } +} + +static struct func_instance *fresh_instance(struct func_instance *src) +{ + struct func_instance *f; + + f = kvzalloc_obj(*f, GFP_KERNEL_ACCOUNT); + if (!f) + return ERR_PTR(-ENOMEM); + f->callsite = src->callsite; + f->depth = src->depth; + f->subprog = src->subprog; + f->subprog_start = src->subprog_start; + f->insn_cnt = src->insn_cnt; + return f; +} + +static void free_instance(struct func_instance *instance) +{ + int i; + + for (i = 0; i <= instance->depth; i++) + kvfree(instance->frames[i]); + kvfree(instance); +} + +/* + * Recursively analyze a subprog with specific 'entry_args'. + * Each callee is analyzed with the exact args from its call site. + * + * Args are recomputed for each call because the dataflow result at_in[] + * depends on the entry args and frame depth. Consider: A->C->D and B->C->D + * Callsites in A and B pass different args into C, so C is recomputed. + * Then within C the same callsite passes different args into D. 
+ */ +static int analyze_subprog(struct bpf_verifier_env *env, + struct arg_track *entry_args, + struct subprog_at_info *info, + struct func_instance *instance, + u32 *callsites) +{ + int subprog = instance->subprog; + int depth = instance->depth; + struct bpf_insn *insns = env->prog->insnsi; + int start = env->subprog_info[subprog].start; + int po_start = env->subprog_info[subprog].postorder_start; + int po_end = env->subprog_info[subprog + 1].postorder_start; + struct func_instance *prev_instance = NULL; + int j, err; + + if (++env->liveness->subprog_calls > 10000) { + verbose(env, "liveness analysis exceeded complexity limit (%d calls)\n", + env->liveness->subprog_calls); + return -E2BIG; + } + + if (need_resched()) + cond_resched(); + + + /* + * When an instance is reused (must_write_initialized == true), + * record into a fresh instance and merge afterward. This avoids + * stale must_write marks for instructions not reached in this pass. + */ + if (instance->must_write_initialized) { + struct func_instance *fresh = fresh_instance(instance); + + if (IS_ERR(fresh)) + return PTR_ERR(fresh); + prev_instance = instance; + instance = fresh; + } + + /* Free prior analysis if this subprog was already visited */ + kvfree(info[subprog].at_in); + info[subprog].at_in = NULL; + + err = compute_subprog_args(env, &info[subprog], entry_args, instance, callsites); + if (err) + goto out_free; + + /* For each reachable call site in the subprog, recurse into callees */ + for (int p = po_start; p < po_end; p++) { + int idx = env->cfg.insn_postorder[p]; + struct arg_track callee_args[BPF_REG_5 + 1]; + struct arg_track none = { .frame = ARG_NONE }; + struct bpf_insn *insn = &insns[idx]; + struct func_instance *callee_instance; + int callee, target; + int caller_reg, cb_callee_reg; + + j = idx - start; /* relative index within this subprog */ + + if (bpf_pseudo_call(insn)) { + target = idx + insn->imm + 1; + callee = bpf_find_subprog(env, target); + if (callee < 0) + continue; + + /* 
Build entry args: R1-R5 from at_in at call site */ + for (int r = BPF_REG_1; r <= BPF_REG_5; r++) + callee_args[r] = info[subprog].at_in[j][r]; + } else if (bpf_calls_callback(env, idx)) { + callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg); + if (callee == -2) { + /* + * same bpf_loop() calls two different callbacks and passes + * stack pointer to them + */ + if (info[subprog].at_in[j][caller_reg].frame == ARG_NONE) + continue; + for (int f = 0; f <= depth; f++) { + err = mark_stack_read(instance, f, idx, SPIS_ALL); + if (err) + goto out_free; + } + continue; + } + if (callee < 0) + continue; + + for (int r = BPF_REG_1; r <= BPF_REG_5; r++) + callee_args[r] = none; + callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg]; + } else { + continue; + } + + if (!has_fp_args(callee_args)) + continue; + + if (depth == MAX_CALL_FRAMES - 1) { + err = -EINVAL; + goto out_free; + } + + callee_instance = call_instance(env, instance, idx, callee); + if (IS_ERR(callee_instance)) { + err = PTR_ERR(callee_instance); + goto out_free; + } + callsites[depth] = idx; + err = analyze_subprog(env, callee_args, info, callee_instance, callsites); + if (err) + goto out_free; + + /* Pull callee's entry liveness back to caller's callsite */ + { + u32 callee_start = callee_instance->subprog_start; + struct per_frame_masks *entry; + + for (int f = 0; f < callee_instance->depth; f++) { + entry = get_frame_masks(callee_instance, f, callee_start); + if (!entry) + continue; + err = mark_stack_read(instance, f, idx, entry->live_before); + if (err) + goto out_free; + } + } + } + + if (prev_instance) { + merge_instances(prev_instance, instance); + free_instance(instance); + instance = prev_instance; + } + update_instance(env, instance); + return 0; + +out_free: + if (prev_instance) + free_instance(instance); + return err; +} + +int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env) +{ + u32 callsites[MAX_CALL_FRAMES] = {}; + int insn_cnt = env->prog->len; 
+ struct func_instance *instance; + struct subprog_at_info *info; + int k, err = 0; + + info = kvzalloc_objs(*info, env->subprog_cnt, GFP_KERNEL_ACCOUNT); + if (!info) + return -ENOMEM; + + env->callsite_at_stack = kvzalloc_objs(*env->callsite_at_stack, insn_cnt, + GFP_KERNEL_ACCOUNT); + if (!env->callsite_at_stack) { + kvfree(info); + return -ENOMEM; + } + + instance = call_instance(env, NULL, 0, 0); + if (IS_ERR(instance)) { + err = PTR_ERR(instance); + goto out; + } + err = analyze_subprog(env, NULL, info, instance, callsites); + if (err) + goto out; + + /* + * Subprogs and callbacks that don't receive FP-derived arguments + * cannot access ancestor stack frames, so they were skipped during + * the recursive walk above. Async callbacks (timer, workqueue) are + * also not reachable from the main program's call graph. Analyze + * all unvisited subprogs as independent roots at depth 0. + * + * Use reverse topological order (callers before callees) so that + * each subprog is analyzed before its callees, allowing the + * recursive walk inside analyze_subprog() to naturally + * reach nested callees that also lack FP-derived args. 
+ */ + for (k = env->subprog_cnt - 1; k >= 0; k--) { + int sub = env->subprog_topo_order[k]; + + if (info[sub].at_in && !bpf_subprog_is_global(env, sub)) + continue; + instance = call_instance(env, NULL, 0, sub); + if (IS_ERR(instance)) { + err = PTR_ERR(instance); + goto out; + } + err = analyze_subprog(env, NULL, info, instance, callsites); + if (err) + goto out; + } + + if (env->log.level & BPF_LOG_LEVEL2) + err = print_instances(env); + +out: + for (k = 0; k < insn_cnt; k++) + kvfree(env->callsite_at_stack[k]); + kvfree(env->callsite_at_stack); + env->callsite_at_stack = NULL; + for (k = 0; k < env->subprog_cnt; k++) + kvfree(info[k].at_in); + kvfree(info); + return err; +} + +/* Each field is a register bitmask */ +struct insn_live_regs { + u16 use; /* registers read by instruction */ + u16 def; /* registers written by instruction */ + u16 in; /* registers that may be alive before instruction */ + u16 out; /* registers that may be alive after instruction */ +}; + +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Compute info->{use,def} fields for the instruction */ +static void compute_insn_live_regs(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct insn_live_regs *info) +{ + struct bpf_call_summary cs; + u8 class = BPF_CLASS(insn->code); + u8 code = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u16 src = BIT(insn->src_reg); + u16 dst = BIT(insn->dst_reg); + u16 r0 = BIT(0); + u16 def = 0; + u16 use = 0xffff; + + switch (class) { + case BPF_LD: + switch (mode) { + case BPF_IMM: + if (BPF_SIZE(insn->code) == BPF_DW) { + def = dst; + use = 0; + } + break; + case BPF_LD | BPF_ABS: + case BPF_LD | BPF_IND: + /* stick with defaults */ + break; + } + break; + case BPF_LDX: + switch (mode) { + case BPF_MEM: + case BPF_MEMSX: + def = dst; + use = src; + break; + } + break; + case BPF_ST: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst; + break; + } + break; + case 
BPF_STX: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst | src; + break; + case BPF_ATOMIC: + switch (insn->imm) { + case BPF_CMPXCHG: + use = r0 | dst | src; + def = r0; + break; + case BPF_LOAD_ACQ: + def = dst; + use = src; + break; + case BPF_STORE_REL: + def = 0; + use = dst | src; + break; + default: + use = dst | src; + if (insn->imm & BPF_FETCH) + def = src; + else + def = 0; + } + break; + } + break; + case BPF_ALU: + case BPF_ALU64: + switch (code) { + case BPF_END: + use = dst; + def = dst; + break; + case BPF_MOV: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = 0; + else + use = src; + break; + default: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + case BPF_JMP: + case BPF_JMP32: + switch (code) { + case BPF_JA: + def = 0; + if (BPF_SRC(insn->code) == BPF_X) + use = dst; + else + use = 0; + break; + case BPF_JCOND: + def = 0; + use = 0; + break; + case BPF_EXIT: + def = 0; + use = r0; + break; + case BPF_CALL: + def = ALL_CALLER_SAVED_REGS; + use = def & ~BIT(BPF_REG_0); + if (bpf_get_call_summary(env, insn, &cs)) + use = GENMASK(cs.num_params, 1); + break; + default: + def = 0; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + } + + info->def = def; + info->use = use; +} + +/* Compute may-live registers after each instruction in the program. + * The register is live after the instruction I if it is read by some + * instruction S following I during program execution and is not + * overwritten between I and S. + * + * Store result in env->insn_aux_data[i].live_regs. 
+ */ +int bpf_compute_live_registers(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + struct insn_live_regs *state; + int insn_cnt = env->prog->len; + int err = 0, i, j; + bool changed; + + /* Use the following algorithm: + * - define the following: + * - I.use : a set of all registers read by instruction I; + * - I.def : a set of all registers written by instruction I; + * - I.in : a set of all registers that may be alive before I execution; + * - I.out : a set of all registers that may be alive after I execution; + * - insn_successors(I): a set of instructions S that might immediately + * follow I for some program execution; + * - associate separate empty sets 'I.in' and 'I.out' with each instruction; + * - visit each instruction in a postorder and update + * state[i].in, state[i].out as follows: + * + * state[i].out = U [state[s].in for S in insn_successors(i)] + * state[i].in = (state[i].out / state[i].def) U state[i].use + * + * (where U stands for set union, / stands for set difference) + * - repeat the computation while {in,out} fields changes for + * any instruction. 
+ */ + state = kvzalloc_objs(*state, insn_cnt, GFP_KERNEL_ACCOUNT); + if (!state) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < insn_cnt; ++i) + compute_insn_live_regs(env, &insns[i], &state[i]); + + /* Forward pass: resolve stack access through FP-derived pointers */ + err = bpf_compute_subprog_arg_access(env); + if (err) + goto out; + + changed = true; + while (changed) { + changed = false; + for (i = 0; i < env->cfg.cur_postorder; ++i) { + int insn_idx = env->cfg.insn_postorder[i]; + struct insn_live_regs *live = &state[insn_idx]; + struct bpf_iarray *succ; + u16 new_out = 0; + u16 new_in = 0; + + succ = bpf_insn_successors(env, insn_idx); + for (int s = 0; s < succ->cnt; ++s) + new_out |= state[succ->items[s]].in; + new_in = (new_out & ~live->def) | live->use; + if (new_out != live->out || new_in != live->in) { + live->in = new_in; + live->out = new_out; + changed = true; + } + } + } + + for (i = 0; i < insn_cnt; ++i) + insn_aux[i].live_regs_before = state[i].in; + + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "Live regs before insn:\n"); + for (i = 0; i < insn_cnt; ++i) { + if (env->insn_aux_data[i].scc) + verbose(env, "%3d ", env->insn_aux_data[i].scc); + else + verbose(env, " "); + verbose(env, "%3d: ", i); + for (j = BPF_REG_0; j < BPF_REG_10; ++j) + if (insn_aux[i].live_regs_before & BIT(j)) + verbose(env, "%d", j); + else + verbose(env, "."); + verbose(env, " "); + bpf_verbose_insn(env, &insns[i]); + if (bpf_is_ldimm64(&insns[i])) + i++; + } + } + +out: + kvfree(state); + return err; +} diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 1ccbf28b2ad9..23267213a17f 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -270,7 +270,7 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, goto enoent; storage = list_next_entry(storage, list_map); - if (!storage) + if (list_entry_is_head(storage, &map->list, list_map)) goto enoent; } else { storage = 
list_first_entry(&map->list, @@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } -static int cgroup_storage_check_btf(const struct bpf_map *map, +static int cgroup_storage_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index a0c3b35de2ce..011e4ec25acd 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -329,47 +329,6 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, } EXPORT_SYMBOL_GPL(bpf_log); -static const struct bpf_line_info * -find_linfo(const struct bpf_verifier_env *env, u32 insn_off) -{ - const struct bpf_line_info *linfo; - const struct bpf_prog *prog; - u32 nr_linfo; - int l, r, m; - - prog = env->prog; - nr_linfo = prog->aux->nr_linfo; - - if (!nr_linfo || insn_off >= prog->len) - return NULL; - - linfo = prog->aux->linfo; - /* Loop invariant: linfo[l].insn_off <= insns_off. - * linfo[0].insn_off == 0 which always satisfies above condition. - * Binary search is searching for rightmost linfo entry that satisfies - * the above invariant, giving us the desired record that covers given - * instruction offset. - */ - l = 0; - r = nr_linfo - 1; - while (l < r) { - /* (r - l + 1) / 2 means we break a tie to the right, so if: - * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off, - * then m=2, we see that linfo[m].insn_off > insn_off, and so - * r becomes 1 and we exit the loop with correct l==1. - * If the tie was broken to the left, m=1 would end us up in - * an endless loop where l and m stay at 1 and r stays at 2. 
- */ - m = l + (r - l + 1) / 2; - if (linfo[m].insn_off <= insn_off) - l = m; - else - r = m - 1; - } - - return &linfo[l]; -} - static const char *ltrim(const char *s) { while (isspace(*s)) @@ -390,7 +349,7 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, return; prev_linfo = env->prev_linfo; - linfo = find_linfo(env, insn_off); + linfo = bpf_find_linfo(env->prog, insn_off); if (!linfo || linfo == prev_linfo) return; @@ -542,7 +501,8 @@ static char slot_type_char[] = { [STACK_ZERO] = '0', [STACK_DYNPTR] = 'd', [STACK_ITER] = 'i', - [STACK_IRQ_FLAG] = 'f' + [STACK_IRQ_FLAG] = 'f', + [STACK_POISON] = 'p', }; #define UNUM_MAX_DECIMAL U16_MAX @@ -581,6 +541,8 @@ int tnum_strn(char *str, size_t size, struct tnum a) if (a.mask == 0) { if (is_unum_decimal(a.value)) return snprintf(str, size, "%llu", a.value); + if (is_snum_decimal(a.value)) + return snprintf(str, size, "%lld", a.value); else return snprintf(str, size, "%#llx", a.value); } @@ -692,7 +654,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (state->frameno != reg->frameno) verbose(env, "[%d]", reg->frameno); if (tnum_is_const(reg->var_off)) { - verbose_snum(env, reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value + reg->delta); return; } } @@ -702,7 +664,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (reg->id) verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); if (reg->id & BPF_ADD_CONST) - verbose(env, "%+d", reg->off); + verbose(env, "%+d", reg->delta); if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (type_is_non_owning_ref(reg->type)) @@ -714,9 +676,9 @@ static void print_reg_state(struct bpf_verifier_env *env, reg->map_ptr->key_size, reg->map_ptr->value_size); } - if (t != SCALAR_VALUE && reg->off) { + if (t != SCALAR_VALUE && reg->delta) { verbose_a("off="); - verbose_snum(env, reg->off); + verbose_snum(env, reg->delta); } if (type_is_pkt_pointer(t)) { verbose_a("r="); @@ -777,7 +739,7 @@ void 
print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie for (j = 0; j < BPF_REG_SIZE; j++) { slot_type = state->stack[i].slot_type[j]; - if (slot_type != STACK_INVALID) + if (slot_type != STACK_INVALID && slot_type != STACK_POISON) valid = true; types_buf[j] = slot_type_char[slot_type]; } @@ -845,7 +807,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie mark_verifier_state_clean(env); } -static inline u32 vlog_alignment(u32 pos) +u32 bpf_vlog_alignment(u32 pos) { return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), BPF_LOG_MIN_ALIGNMENT) - pos - 1; @@ -857,7 +819,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { /* remove new line character */ bpf_vlog_reset(&env->log, env->prev_log_pos - 1); - verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); + verbose(env, "%*c;", bpf_vlog_alignment(env->prev_insn_print_pos), ' '); } else { verbose(env, "%d:", env->insn_idx); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1adeb4d3b8cf..0f57608b385d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -751,7 +751,7 @@ free_stack: return err; } -static int trie_check_btf(const struct bpf_map *map, +static int trie_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index bd45dda9dc35..e9662db7198f 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -102,6 +102,8 @@ struct bpf_mem_cache { int percpu_size; bool draining; struct bpf_mem_cache *tgt; + void (*dtor)(void *obj, void *ctx); + void *dtor_ctx; /* list of objects to be freed after RCU GP */ struct llist_head free_by_rcu; @@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu) kfree(obj); } -static int free_all(struct llist_node *llnode, bool percpu) 
+static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu) { struct llist_node *pos, *t; int cnt = 0; llist_for_each_safe(pos, t, llnode) { + if (c->dtor) + c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx); free_one(pos, percpu); cnt++; } @@ -276,21 +280,10 @@ static void __free_rcu(struct rcu_head *head) { struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); - free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); + free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); atomic_set(&c->call_rcu_ttrace_in_progress, 0); } -static void __free_rcu_tasks_trace(struct rcu_head *head) -{ - /* If RCU Tasks Trace grace period implies RCU grace period, - * there is no need to invoke call_rcu(). - */ - if (rcu_trace_implies_rcu_gp()) - __free_rcu(head); - else - call_rcu(head, __free_rcu); -} - static void enque_to_free(struct bpf_mem_cache *c, void *obj) { struct llist_node *llnode = obj; @@ -308,7 +301,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c) if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { if (unlikely(READ_ONCE(c->draining))) { llnode = llist_del_all(&c->free_by_rcu_ttrace); - free_all(llnode, !!c->percpu_size); + free_all(c, llnode, !!c->percpu_size); } return; } @@ -322,12 +315,12 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c) return; } - /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. - * If RCU Tasks Trace grace period implies RCU grace period, free - * these elements directly, else use call_rcu() to wait for normal - * progs to finish and finally do free_one() on each element. + /* + * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. + * RCU Tasks Trace grace period implies RCU grace period, so pass + * __free_rcu directly as the callback. 
*/ - call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace); + call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu); } static void free_bulk(struct bpf_mem_cache *c) @@ -417,7 +410,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c) dec_active(c, &flags); if (unlikely(READ_ONCE(c->draining))) { - free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); + free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size); atomic_set(&c->call_rcu_in_progress, 0); } else { call_rcu_hurry(&c->rcu, __free_by_rcu); @@ -635,13 +628,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c) * Except for waiting_for_gp_ttrace list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). */ - free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); - free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); - free_all(__llist_del_all(&c->free_llist), percpu); - free_all(__llist_del_all(&c->free_llist_extra), percpu); - free_all(__llist_del_all(&c->free_by_rcu), percpu); - free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); - free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu); + free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu); + free_all(c, __llist_del_all(&c->free_llist), percpu); + free_all(c, __llist_del_all(&c->free_llist_extra), percpu); + free_all(c, __llist_del_all(&c->free_by_rcu), percpu); + free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu); + free_all(c, llist_del_all(&c->waiting_for_gp), percpu); } static void check_mem_cache(struct bpf_mem_cache *c) @@ -680,6 +673,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma) static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { + /* We can free dtor ctx only once all callbacks are done using it. 
*/ + if (ma->dtor_ctx_free) + ma->dtor_ctx_free(ma->dtor_ctx); check_leaked_objs(ma); free_percpu(ma->cache); free_percpu(ma->caches); @@ -689,20 +685,18 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) static void free_mem_alloc(struct bpf_mem_alloc *ma) { - /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks + /* + * waiting_for_gp[_ttrace] lists were drained, but RCU callbacks * might still execute. Wait for them. * * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used - * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(), - * so if call_rcu(head, __free_rcu) is skipped due to - * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by - * using rcu_trace_implies_rcu_gp() as well. + * to wait for the pending __free_by_rcu(), and __free_rcu(). RCU Tasks + * Trace grace period implies RCU grace period, so all __free_rcu don't + * need extra call_rcu() (and thus extra rcu_barrier() here). 
*/ rcu_barrier(); /* wait for __free_by_rcu */ rcu_barrier_tasks_trace(); /* wait for __free_rcu */ - if (!rcu_trace_implies_rcu_gp()) - rcu_barrier(); free_mem_alloc_no_barrier(ma); } @@ -1014,3 +1008,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size) return 0; } + +void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx), + void (*dtor_ctx_free)(void *ctx), void *ctx) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i; + + ma->dtor_ctx_free = dtor_ctx_free; + ma->dtor_ctx = ctx; + + if (ma->cache) { + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + c->dtor = dtor; + c->dtor_ctx = ctx; + } + } + if (ma->caches) { + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->dtor = dtor; + c->dtor_ctx = ctx; + } + } + } +} diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 0ad97d643bf4..0d6f5569588c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -435,9 +435,8 @@ static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) if (aux->offload) { args->info->ifindex = aux->offload->netdev->ifindex; - net = dev_net(aux->offload->netdev); - get_net(net); - ns = &net->ns; + net = maybe_get_net(dev_net(aux->offload->netdev)); + ns = net ? &net->ns : NULL; } else { args->info->ifindex = 0; ns = NULL; @@ -647,9 +646,8 @@ static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) if (args->offmap->netdev) { args->info->ifindex = args->offmap->netdev->ifindex; - net = dev_net(args->offmap->netdev); - get_net(net); - ns = &net->ns; + net = maybe_get_net(dev_net(args->offmap->netdev)); + ns = net ? 
&net->ns : NULL; } else { args->info->ifindex = 0; ns = NULL; diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c new file mode 100644 index 000000000000..8478d2c6ed5b --- /dev/null +++ b/kernel/bpf/states.c @@ -0,0 +1,1563 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/filter.h> + +#define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) + +#define BPF_COMPLEXITY_LIMIT_STATES 64 + +static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx) +{ + return bpf_is_may_goto_insn(&env->prog->insnsi[insn_idx]); +} + +static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].is_iter_next; +} + +static void update_peak_states(struct bpf_verifier_env *env) +{ + u32 cur_states; + + cur_states = env->explored_states_size + env->free_list_size + env->num_backedges; + env->peak_states = max(env->peak_states, cur_states); +} + +/* struct bpf_verifier_state->parent refers to states + * that are in either of env->{explored_states,free_list}. + * In both cases the state is contained in struct bpf_verifier_state_list. 
+ */ +static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st) +{ + if (st->parent) + return container_of(st->parent, struct bpf_verifier_state_list, state); + return NULL; +} + +static bool incomplete_read_marks(struct bpf_verifier_env *env, + struct bpf_verifier_state *st); + +/* A state can be freed if it is no longer referenced: + * - is in the env->free_list; + * - has no children states; + */ +static void maybe_free_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state_list *sl) +{ + if (!sl->in_free_list + || sl->state.branches != 0 + || incomplete_read_marks(env, &sl->state)) + return; + list_del(&sl->node); + bpf_free_verifier_state(&sl->state, false); + kfree(sl); + env->free_list_size--; +} + +/* For state @st look for a topmost frame with frame_insn_idx() in some SCC, + * if such frame exists form a corresponding @callchain as an array of + * call sites leading to this frame and SCC id. + * E.g.: + * + * void foo() { A: loop {... SCC#1 ...}; } + * void bar() { B: loop { C: foo(); ... SCC#2 ... } + * D: loop { E: foo(); ... SCC#3 ... } } + * void main() { F: bar(); } + * + * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending + * on @st frame call sites being (F,C,A) or (F,E,A). + */ +static bool compute_scc_callchain(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_callchain *callchain) +{ + u32 i, scc, insn_idx; + + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= st->curframe; i++) { + insn_idx = bpf_frame_insn_idx(st, i); + scc = env->insn_aux_data[insn_idx].scc; + if (scc) { + callchain->scc = scc; + break; + } else if (i < st->curframe) { + callchain->callsites[i] = insn_idx; + } else { + return false; + } + } + return true; +} + +/* Check if bpf_scc_visit instance for @callchain exists. 
*/ +static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) +{ + struct bpf_scc_info *info = env->scc_info[callchain->scc]; + struct bpf_scc_visit *visits = info ? info->visits : NULL; + u32 i; + + if (!info) + return NULL; + for (i = 0; i < info->num_visits; i++) + if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0) + return &visits[i]; + return NULL; +} + +/* Allocate a new bpf_scc_visit instance corresponding to @callchain. + * Allocated instances are alive for a duration of the do_check_common() + * call and are freed by free_states(). + */ +static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) +{ + struct bpf_scc_visit *visit; + struct bpf_scc_info *info; + u32 scc, num_visits; + u64 new_sz; + + scc = callchain->scc; + info = env->scc_info[scc]; + num_visits = info ? info->num_visits : 0; + new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); + info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT); + if (!info) + return NULL; + env->scc_info[scc] = info; + info->num_visits = num_visits + 1; + visit = &info->visits[num_visits]; + memset(visit, 0, sizeof(*visit)); + memcpy(&visit->callchain, callchain, sizeof(*callchain)); + return visit; +} + +/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */ +static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) +{ + char *buf = env->tmp_str_buf; + int i, delta = 0; + + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "("); + for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) { + if (!callchain->callsites[i]) + break; + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,", + callchain->callsites[i]); + } + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc); + return env->tmp_str_buf; +} + +/* If callchain for @st exists (@st is in some SCC), ensure that + * 
bpf_scc_visit instance for this callchain exists. + * If instance does not exist or is empty, assign visit->entry_state to @st. + */ +static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) + return 0; + visit = scc_visit_lookup(env, callchain); + visit = visit ?: scc_visit_alloc(env, callchain); + if (!visit) + return -ENOMEM; + if (!visit->entry_state) { + visit->entry_state = st; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC enter %s\n", format_callchain(env, callchain)); + } + return 0; +} + +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit); + +/* If callchain for @st exists (@st is in some SCC), make it empty: + * - set visit->entry_state to NULL; + * - flush accumulated backedges. + */ +static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) + return 0; + visit = scc_visit_lookup(env, callchain); + if (!visit) { + /* + * If path traversal stops inside an SCC, corresponding bpf_scc_visit + * must exist for non-speculative paths. For non-speculative paths + * traversal stops when: + * a. Verification error is found, maybe_exit_scc() is not called. + * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member + * of any SCC. + * c. A checkpoint is reached and matched. Checkpoints are created by + * is_state_visited(), which calls maybe_enter_scc(), which allocates + * bpf_scc_visit instances for checkpoints within SCCs. + * (c) is the only case that can reach this point. 
+ */ + if (!st->speculative) { + verifier_bug(env, "scc exit: no visit info for call chain %s", + format_callchain(env, callchain)); + return -EFAULT; + } + return 0; + } + if (visit->entry_state != st) + return 0; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC exit %s\n", format_callchain(env, callchain)); + visit->entry_state = NULL; + env->num_backedges -= visit->num_backedges; + visit->num_backedges = 0; + update_peak_states(env); + return propagate_backedges(env, visit); +} + +/* Lookup an bpf_scc_visit instance corresponding to @st callchain + * and add @backedge to visit->backedges. @st callchain must exist. + */ +static int add_scc_backedge(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_backedge *backedge) +{ + struct bpf_scc_callchain *callchain = &env->callchain_buf; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, callchain)) { + verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d", + st->insn_idx); + return -EFAULT; + } + visit = scc_visit_lookup(env, callchain); + if (!visit) { + verifier_bug(env, "add backedge: no visit info for call chain %s", + format_callchain(env, callchain)); + return -EFAULT; + } + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC backedge %s\n", format_callchain(env, callchain)); + backedge->next = visit->backedges; + visit->backedges = backedge; + visit->num_backedges++; + env->num_backedges++; + update_peak_states(env); + return 0; +} + +/* bpf_reg_state->live marks for registers in a state @st are incomplete, + * if state @st is in some SCC and not all execution paths starting at this + * SCC are fully explored. 
 */
static bool incomplete_read_marks(struct bpf_verifier_env *env,
				  struct bpf_verifier_state *st)
{
	struct bpf_scc_callchain *callchain = &env->callchain_buf;
	struct bpf_scc_visit *visit;

	/* states outside of any SCC never have pending marks */
	if (!compute_scc_callchain(env, st, callchain))
		return false;
	visit = scc_visit_lookup(env, callchain);
	if (!visit)
		return false;
	/* pending backedges mean read/precision marks are not final yet */
	return !!visit->backedges;
}

/* Decrement branch counts along @st's parent chain. A state whose counter
 * drops to zero is fully explored: run SCC exit bookkeeping for it and
 * opportunistically free its list-embedded child state. Freeing is
 * deferred by one iteration: the previous node @sl is released only after
 * the current state's maybe_exit_scc() has run.
 */
int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
	struct bpf_verifier_state_list *sl = NULL, *parent_sl;
	struct bpf_verifier_state *parent;
	int err;

	while (st) {
		u32 br = --st->branches;

		/* verifier_bug_if(br > 1, ...) technically makes sense here,
		 * but see comment in push_stack(), hence:
		 */
		verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br);
		if (br)
			/* other branches of this state still unexplored */
			break;
		err = maybe_exit_scc(env, st);
		if (err)
			return err;
		parent = st->parent;
		parent_sl = state_parent_as_list(st);
		if (sl)
			maybe_free_verifier_state(env, sl);
		st = parent;
		sl = parent_sl;
	}
	return 0;
}

/* check %cur's range satisfies %old's */
static bool range_within(const struct bpf_reg_state *old,
			 const struct bpf_reg_state *cur)
{
	return old->umin_value <= cur->umin_value &&
	       old->umax_value >= cur->umax_value &&
	       old->smin_value <= cur->smin_value &&
	       old->smax_value >= cur->smax_value &&
	       old->u32_min_value <= cur->u32_min_value &&
	       old->u32_max_value >= cur->u32_max_value &&
	       old->s32_min_value <= cur->s32_min_value &&
	       old->s32_max_value >= cur->s32_max_value;
}

/* If in the old state two registers had the same id, then they need to have
 * the same id in the new state as well. But that id could be different from
 * the old state, so we need to track the mapping from old to new ids.
 * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent
 * regs with old id 5 must also have new id 9 for the new state to be safe.
But
 * regs with a different old id could still have new id 9, we don't care about
 * that.
 * So we look through our idmap to see if this old id has been seen before. If
 * so, we require the new id to match; otherwise, we add the id pair to the map.
 */
static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap)
{
	struct bpf_id_pair *map = idmap->map;
	unsigned int i;

	/* either both IDs should be set or both should be zero */
	if (!!old_id != !!cur_id)
		return false;

	if (old_id == 0) /* cur_id == 0 as well */
		return true;

	for (i = 0; i < idmap->cnt; i++) {
		if (map[i].old == old_id)
			/* known old id: cur id must equal the recorded one */
			return map[i].cur == cur_id;
		if (map[i].cur == cur_id)
			/* cur id already claimed by a different old id */
			return false;
	}

	/* Reached the end of known mappings; haven't seen this id before */
	if (idmap->cnt < BPF_ID_MAP_SIZE) {
		map[idmap->cnt].old = old_id;
		map[idmap->cnt].cur = cur_id;
		idmap->cnt++;
		return true;
	}

	/* We ran out of idmap slots, which should be impossible */
	WARN_ON_ONCE(1);
	return false;
}

/*
 * Compare scalar register IDs for state equivalence.
 *
 * When old_id == 0, the old register is independent - not linked to any
 * other register. Any linking in the current state only adds constraints,
 * making it more restrictive. Since the old state didn't rely on any ID
 * relationships for this register, it's always safe to accept cur regardless
 * of its ID. Hence, return true immediately.
 *
 * When old_id != 0 but cur_id == 0, we need to ensure that different
 * independent registers in cur don't incorrectly satisfy the ID matching
 * requirements of linked registers in old.
 *
 * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0
 * and r7.id=0 (both independent), without temp IDs both would map old_id=X
 * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map
 * X->temp2, but X is already mapped to temp1, so the check fails correctly.
+ * + * When old_id has BPF_ADD_CONST set, the compound id (base | flag) and the + * base id (flag stripped) must both map consistently. Example: old has + * r2.id=A, r3.id=A|flag (r3 = r2 + delta), cur has r2.id=B, r3.id=C|flag + * (r3 derived from unrelated r4). Without the base check, idmap gets two + * independent entries A->B and A|flag->C|flag, missing that A->C conflicts + * with A->B. The base ID cross-check catches this. + */ +static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) +{ + if (!old_id) + return true; + + cur_id = cur_id ? cur_id : ++idmap->tmp_id_gen; + + if (!check_ids(old_id, cur_id, idmap)) + return false; + if (old_id & BPF_ADD_CONST) { + old_id &= ~BPF_ADD_CONST; + cur_id &= ~BPF_ADD_CONST; + if (!check_ids(old_id, cur_id, idmap)) + return false; + } + return true; +} + +static void __clean_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *st, + u16 live_regs, int frame) +{ + int i, j; + + for (i = 0; i < BPF_REG_FP; i++) { + /* liveness must not touch this register anymore */ + if (!(live_regs & BIT(i))) + /* since the register is unused, clear its state + * to make further comparison simpler + */ + bpf_mark_reg_not_init(env, &st->regs[i]); + } + + /* + * Clean dead 4-byte halves within each SPI independently. + * half_spi 2*i → lower half: slot_type[0..3] (closer to FP) + * half_spi 2*i+1 → upper half: slot_type[4..7] (farther from FP) + */ + for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { + bool lo_live = bpf_stack_slot_alive(env, frame, i * 2); + bool hi_live = bpf_stack_slot_alive(env, frame, i * 2 + 1); + + if (!hi_live || !lo_live) { + int start = !lo_live ? 0 : BPF_REG_SIZE / 2; + int end = !hi_live ? BPF_REG_SIZE : BPF_REG_SIZE / 2; + u8 stype = st->stack[i].slot_type[7]; + + /* + * Don't clear special slots. + * destroy_if_dynptr_stack_slot() needs STACK_DYNPTR to + * detect overwrites and invalidate associated data slices. 
+ * is_iter_reg_valid_uninit() and is_irq_flag_reg_valid_uninit() + * check for their respective slot types to detect double-create. + */ + if (stype == STACK_DYNPTR || stype == STACK_ITER || + stype == STACK_IRQ_FLAG) + continue; + + /* + * Only destroy spilled_ptr when hi half is dead. + * If hi half is still live with STACK_SPILL, the + * spilled_ptr metadata is needed for correct state + * comparison in stacksafe(). + * is_spilled_reg() is using slot_type[7], but + * is_spilled_scalar_after() check either slot_type[0] or [4] + */ + if (!hi_live) { + struct bpf_reg_state *spill = &st->stack[i].spilled_ptr; + + if (lo_live && stype == STACK_SPILL) { + u8 val = STACK_MISC; + + /* + * 8 byte spill of scalar 0 where half slot is dead + * should become STACK_ZERO in lo 4 bytes. + */ + if (bpf_register_is_null(spill)) + val = STACK_ZERO; + for (j = 0; j < 4; j++) { + u8 *t = &st->stack[i].slot_type[j]; + + if (*t == STACK_SPILL) + *t = val; + } + } + bpf_mark_reg_not_init(env, spill); + } + for (j = start; j < end; j++) + st->stack[i].slot_type[j] = STACK_POISON; + } + } +} + +static int clean_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + int i, err; + + err = bpf_live_stack_query_init(env, st); + if (err) + return err; + for (i = 0; i <= st->curframe; i++) { + u32 ip = bpf_frame_insn_idx(st, i); + u16 live_regs = env->insn_aux_data[ip].live_regs_before; + + __clean_func_state(env, st->frame[i], live_regs, i); + } + return 0; +} + +static bool regs_exact(const struct bpf_reg_state *rold, + const struct bpf_reg_state *rcur, + struct bpf_idmap *idmap) +{ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + check_ids(rold->id, rcur->id, idmap) && + check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); +} + +enum exact_level { + NOT_EXACT, + EXACT, + RANGE_WITHIN +}; + +/* Returns true if (rold safe implies rcur safe) */ +static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + struct 
bpf_reg_state *rcur, struct bpf_idmap *idmap, + enum exact_level exact) +{ + if (exact == EXACT) + return regs_exact(rold, rcur, idmap); + + if (rold->type == NOT_INIT) + /* explored state can't have used this */ + return true; + + /* Enforce that register types have to match exactly, including their + * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general + * rule. + * + * One can make a point that using a pointer register as unbounded + * SCALAR would be technically acceptable, but this could lead to + * pointer leaks because scalars are allowed to leak while pointers + * are not. We could make this safe in special cases if root is + * calling us, but it's probably not worth the hassle. + * + * Also, register types that are *not* MAYBE_NULL could technically be + * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE + * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point + * to the same map). + * However, if the old MAYBE_NULL register then got NULL checked, + * doing so could have affected others with the same id, and we can't + * check for that because we lost the id when we converted to + * a non-MAYBE_NULL variant. + * So, as a general rule we don't allow mixing MAYBE_NULL and + * non-MAYBE_NULL registers as well. + */ + if (rold->type != rcur->type) + return false; + + switch (base_type(rold->type)) { + case SCALAR_VALUE: + if (env->explore_alu_limits) { + /* explore_alu_limits disables tnum_in() and range_within() + * logic and requires everything to be strict + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && + check_scalar_ids(rold->id, rcur->id, idmap); + } + if (!rold->precise && exact == NOT_EXACT) + return true; + /* + * Linked register tracking uses rold->id to detect relationships. + * When rold->id == 0, the register is independent and any linking + * in rcur only adds constraints. When rold->id != 0, we must verify + * id mapping and (for BPF_ADD_CONST) offset consistency. 
+ * + * +------------------+-----------+------------------+---------------+ + * | | rold->id | rold + ADD_CONST | rold->id == 0 | + * |------------------+-----------+------------------+---------------| + * | rcur->id | range,ids | false | range | + * | rcur + ADD_CONST | false | range,ids,off | range | + * | rcur->id == 0 | range,ids | false | range | + * +------------------+-----------+------------------+---------------+ + * + * Why check_ids() for scalar registers? + * + * Consider the following BPF code: + * 1: r6 = ... unbound scalar, ID=a ... + * 2: r7 = ... unbound scalar, ID=b ... + * 3: if (r6 > r7) goto +1 + * 4: r6 = r7 + * 5: if (r6 > X) goto ... + * 6: ... memory operation using r7 ... + * + * First verification path is [1-6]: + * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; + * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark + * r7 <= X, because r6 and r7 share same id. + * Next verification path is [1-4, 6]. + * + * Instruction (6) would be reached in two states: + * I. r6{.id=b}, r7{.id=b} via path 1-6; + * II. r6{.id=a}, r7{.id=b} via path 1-4, 6. + * + * Use check_ids() to distinguish these states. + * --- + * Also verify that new value satisfies old value range knowledge. + */ + + /* + * ADD_CONST flags must match exactly: BPF_ADD_CONST32 and + * BPF_ADD_CONST64 have different linking semantics in + * sync_linked_regs() (alu32 zero-extends, alu64 does not), + * so pruning across different flag types is unsafe. 
+ */ + if (rold->id && + (rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST)) + return false; + + /* Both have offset linkage: offsets must match */ + if ((rold->id & BPF_ADD_CONST) && rold->delta != rcur->delta) + return false; + + if (!check_scalar_ids(rold->id, rcur->id, idmap)) + return false; + + return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_BUF: + case PTR_TO_TP_BUFFER: + /* If the new min/max/var_off satisfy the old ones and + * everything else matches, we are OK. + */ + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && + range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off) && + check_ids(rold->id, rcur->id, idmap) && + check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + case PTR_TO_PACKET_META: + case PTR_TO_PACKET: + /* We must have at least as much range as the old ptr + * did, so that any accesses which were safe before are + * still safe. This is true even if old range < old off, + * since someone could have accessed through (ptr - k), or + * even done ptr -= k in a register, to get a safe access. 
+ */ + if (rold->range < 0 || rcur->range < 0) { + /* special case for [BEYOND|AT]_PKT_END */ + if (rold->range != rcur->range) + return false; + } else if (rold->range > rcur->range) { + return false; + } + /* id relations must be preserved */ + if (!check_ids(rold->id, rcur->id, idmap)) + return false; + /* new val must satisfy old val knowledge */ + return range_within(rold, rcur) && + tnum_in(rold->var_off, rcur->var_off); + case PTR_TO_STACK: + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != fp-8 in bar + */ + return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; + case PTR_TO_ARENA: + return true; + case PTR_TO_INSN: + return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && + range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); + default: + return regs_exact(rold, rcur, idmap); + } +} + +static struct bpf_reg_state unbound_reg; + +static __init int unbound_reg_init(void) +{ + bpf_mark_reg_unknown_imprecise(&unbound_reg); + return 0; +} +late_initcall(unbound_reg_init); + +static bool is_spilled_scalar_after(const struct bpf_stack_state *stack, int im) +{ + return stack->slot_type[im] == STACK_SPILL && + stack->spilled_ptr.type == SCALAR_VALUE; +} + +static bool is_stack_misc_after(struct bpf_verifier_env *env, + struct bpf_stack_state *stack, int im) +{ + u32 i; + + for (i = im; i < ARRAY_SIZE(stack->slot_type); ++i) { + if ((stack->slot_type[i] == STACK_MISC) || + ((stack->slot_type[i] == STACK_INVALID || stack->slot_type[i] == STACK_POISON) && + env->allow_uninit_stack)) + continue; + return false; + } + + return true; +} + +static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env, + struct bpf_stack_state *stack, int im) +{ + if (is_spilled_scalar_after(stack, im)) + return &stack->spilled_ptr; + + if (is_stack_misc_after(env, stack, im)) + return &unbound_reg; + + return NULL; +} + +static bool stacksafe(struct 
bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_idmap *idmap, + enum exact_level exact) +{ + int i, spi; + + /* walk slots of the explored stack and ignore any additional + * slots in the current stack, since explored(safe) state + * didn't use them + */ + for (i = 0; i < old->allocated_stack; i++) { + struct bpf_reg_state *old_reg, *cur_reg; + int im = i % BPF_REG_SIZE; + + spi = i / BPF_REG_SIZE; + + if (exact == EXACT) { + u8 old_type = old->stack[spi].slot_type[i % BPF_REG_SIZE]; + u8 cur_type = i < cur->allocated_stack ? + cur->stack[spi].slot_type[i % BPF_REG_SIZE] : STACK_INVALID; + + /* STACK_INVALID and STACK_POISON are equivalent for pruning */ + if (old_type == STACK_POISON) + old_type = STACK_INVALID; + if (cur_type == STACK_POISON) + cur_type = STACK_INVALID; + if (i >= cur->allocated_stack || old_type != cur_type) + return false; + } + + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID || + old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_POISON) + continue; + + if (env->allow_uninit_stack && + old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC) + continue; + + /* explored stack has more populated slots than current stack + * and these slots were used + */ + if (i >= cur->allocated_stack) + return false; + + /* + * 64 and 32-bit scalar spills vs MISC/INVALID slots and vice versa. + * Load from MISC/INVALID slots produces unbound scalar. + * Construct a fake register for such stack and call + * regsafe() to ensure scalar ids are compared. + */ + if (im == 0 || im == 4) { + old_reg = scalar_reg_for_stack(env, &old->stack[spi], im); + cur_reg = scalar_reg_for_stack(env, &cur->stack[spi], im); + if (old_reg && cur_reg) { + if (!regsafe(env, old_reg, cur_reg, idmap, exact)) + return false; + i += (im == 0 ? BPF_REG_SIZE - 1 : 3); + continue; + } + } + + /* if old state was safe with misc data in the stack + * it will be safe with zero-initialized stack. 
+ * The opposite is not true + */ + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && + cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1) + continue; + /* Both old and cur are having same slot_type */ + switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) { + case STACK_SPILL: + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} + * but current path has stored: + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} + * such verifier states are not equivalent. 
+ * return false to continue verification of this path + */ + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap, exact)) + return false; + break; + case STACK_DYNPTR: + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + if (old_reg->dynptr.type != cur_reg->dynptr.type || + old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || + !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + return false; + break; + case STACK_ITER: + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + /* iter.depth is not compared between states as it + * doesn't matter for correctness and would otherwise + * prevent convergence; we maintain it only to prevent + * infinite loop check triggering, see + * iter_active_depths_differ() + */ + if (old_reg->iter.btf != cur_reg->iter.btf || + old_reg->iter.btf_id != cur_reg->iter.btf_id || + old_reg->iter.state != cur_reg->iter.state || + /* ignore {old_reg,cur_reg}->iter.depth, see above */ + !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + return false; + break; + case STACK_IRQ_FLAG: + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || + old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) + return false; + break; + case STACK_MISC: + case STACK_ZERO: + case STACK_INVALID: + case STACK_POISON: + continue; + /* Ensure that new unhandled slot types return false by default */ + default: + return false; + } + } + return true; +} + +static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur, + struct bpf_idmap *idmap) +{ + int i; + + if (old->acquired_refs != cur->acquired_refs) + return false; + + if (old->active_locks != cur->active_locks) + return false; + + if (old->active_preempt_locks != cur->active_preempt_locks) + return false; + + if (old->active_rcu_locks != cur->active_rcu_locks) + return 
false; + + if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) + return false; + + if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || + old->active_lock_ptr != cur->active_lock_ptr) + return false; + + for (i = 0; i < old->acquired_refs; i++) { + if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || + old->refs[i].type != cur->refs[i].type) + return false; + switch (old->refs[i].type) { + case REF_TYPE_PTR: + case REF_TYPE_IRQ: + break; + case REF_TYPE_LOCK: + case REF_TYPE_RES_LOCK: + case REF_TYPE_RES_LOCK_IRQ: + if (old->refs[i].ptr != cur->refs[i].ptr) + return false; + break; + default: + WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type); + return false; + } + } + + return true; +} + +/* compare two verifier states + * + * all states stored in state_list are known to be valid, since + * verifier reached 'bpf_exit' instruction through them + * + * this function is called when verifier exploring different branches of + * execution popped from the state stack. If it sees an old state that has + * more strict register state and more strict stack state then this execution + * branch doesn't need to be explored further, since verifier already + * concluded that more strict state leads to valid finish. + * + * Therefore two states are equivalent if register state is more conservative + * and explored stack state is more conservative than the current one. + * Example: + * explored current + * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) + * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) + * + * In other words if current stack state (one being explored) has more + * valid slots than old one that already passed validation, it means + * the verifier can stop exploring and conclude that current state is valid too + * + * Similarly with registers. 
If explored state has register type as invalid + * whereas register type in current state is meaningful, it means that + * the current state will reach 'bpf_exit' instruction safely + */ +static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact) +{ + u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before; + u16 i; + + if (old->callback_depth > cur->callback_depth) + return false; + + for (i = 0; i < MAX_BPF_REG; i++) + if (((1 << i) & live_regs) && + !regsafe(env, &old->regs[i], &cur->regs[i], + &env->idmap_scratch, exact)) + return false; + + if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) + return false; + + return true; +} + +static void reset_idmap_scratch(struct bpf_verifier_env *env) +{ + struct bpf_idmap *idmap = &env->idmap_scratch; + + idmap->tmp_id_gen = env->id_gen; + idmap->cnt = 0; +} + +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + enum exact_level exact) +{ + u32 insn_idx; + int i; + + if (old->curframe != cur->curframe) + return false; + + reset_idmap_scratch(env); + + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. 
+ */ + if (old->speculative && !cur->speculative) + return false; + + if (old->in_sleepable != cur->in_sleepable) + return false; + + if (!refsafe(old, cur, &env->idmap_scratch)) + return false; + + /* for states to be equal callsites have to be the same + * and all frame states need to be equivalent + */ + for (i = 0; i <= old->curframe; i++) { + insn_idx = bpf_frame_insn_idx(old, i); + if (old->frame[i]->callsite != cur->frame[i]->callsite) + return false; + if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) + return false; + } + return true; +} + +/* find precise scalars in the previous equivalent state and + * propagate them into the current state + */ +static int propagate_precision(struct bpf_verifier_env *env, + const struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + bool *changed) +{ + struct bpf_reg_state *state_reg; + struct bpf_func_state *state; + int i, err = 0, fr; + bool first; + + for (fr = old->curframe; fr >= 0; fr--) { + state = old->frame[fr]; + state_reg = state->regs; + first = true; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) { + if (first) + verbose(env, "frame %d: propagating r%d", fr, i); + else + verbose(env, ",r%d", i); + } + bpf_bt_set_frame_reg(&env->bt, fr, i); + first = false; + } + + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (!bpf_is_spilled_reg(&state->stack[i])) + continue; + state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) { + if (first) + verbose(env, "frame %d: propagating fp%d", + fr, (-i - 1) * BPF_REG_SIZE); + else + verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE); + } + bpf_bt_set_frame_slot(&env->bt, fr, i); + first = false; + } + if (!first && (env->log.level & BPF_LOG_LEVEL2)) + verbose(env, "\n"); + } + + err = 
bpf_mark_chain_precision(env, cur, -1, changed); + if (err < 0) + return err; + + return 0; +} + +#define MAX_BACKEDGE_ITERS 64 + +/* Propagate read and precision marks from visit->backedges[*].state->equal_state + * to corresponding parent states of visit->backedges[*].state until fixed point is reached, + * then free visit->backedges. + * After execution of this function incomplete_read_marks() will return false + * for all states corresponding to @visit->callchain. + */ +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit) +{ + struct bpf_scc_backedge *backedge; + struct bpf_verifier_state *st; + bool changed; + int i, err; + + i = 0; + do { + if (i++ > MAX_BACKEDGE_ITERS) { + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "%s: too many iterations\n", __func__); + for (backedge = visit->backedges; backedge; backedge = backedge->next) + bpf_mark_all_scalars_precise(env, &backedge->state); + break; + } + changed = false; + for (backedge = visit->backedges; backedge; backedge = backedge->next) { + st = &backedge->state; + err = propagate_precision(env, st->equal_state, st, &changed); + if (err) + return err; + } + } while (changed); + + bpf_free_backedges(visit); + return 0; +} + +static bool states_maybe_looping(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + struct bpf_func_state *fold, *fcur; + int i, fr = cur->curframe; + + if (old->curframe != fr) + return false; + + fold = old->frame[fr]; + fcur = cur->frame[fr]; + for (i = 0; i < MAX_BPF_REG; i++) + if (memcmp(&fold->regs[i], &fcur->regs[i], + offsetof(struct bpf_reg_state, frameno))) + return false; + return true; +} + +/* is_state_visited() handles iter_next() (see process_iter_next_call() for + * terminology) calls specially: as opposed to bounded BPF loops, it *expects* + * states to match, which otherwise would look like an infinite loop. 
So while + * iter_next() calls are taken care of, we still need to be careful and + * prevent erroneous and too eager declaration of "infinite loop", when + * iterators are involved. + * + * Here's a situation in pseudo-BPF assembly form: + * + * 0: again: ; set up iter_next() call args + * 1: r1 = &it ; <CHECKPOINT HERE> + * 2: call bpf_iter_num_next ; this is iter_next() call + * 3: if r0 == 0 goto done + * 4: ... something useful here ... + * 5: goto again ; another iteration + * 6: done: + * 7: r1 = &it + * 8: call bpf_iter_num_destroy ; clean up iter state + * 9: exit + * + * This is a typical loop. Let's assume that we have a prune point at 1:, + * before we get to `call bpf_iter_num_next` (e.g., because of that `goto + * again`, assuming other heuristics don't get in a way). + * + * When we first time come to 1:, let's say we have some state X. We proceed + * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit. + * Now we come back to validate that forked ACTIVE state. We proceed through + * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we + * are converging. But the problem is that we don't know that yet, as this + * convergence has to happen at iter_next() call site only. So if nothing is + * done, at 1: verifier will use bounded loop logic and declare infinite + * looping (and would be *technically* correct, if not for iterator's + * "eventual sticky NULL" contract, see process_iter_next_call()). But we + * don't want that. So what we do in process_iter_next_call() when we go on + * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's + * a different iteration. So when we suspect an infinite loop, we additionally + * check if any of the *ACTIVE* iterator states depths differ. If yes, we + * pretend we are not looping and wait for next iter_next() call. + * + * This only applies to ACTIVE state. 
In DRAINED state we don't expect to + * loop, because that would actually mean infinite loop, as DRAINED state is + * "sticky", and so we'll keep returning into the same instruction with the + * same state (at least in one of possible code paths). + * + * This approach allows to keep infinite loop heuristic even in the face of + * active iterator. E.g., C snippet below is and will be detected as + * infinitely looping: + * + * struct bpf_iter_num it; + * int *p, x; + * + * bpf_iter_num_new(&it, 0, 10); + * while ((p = bpf_iter_num_next(&t))) { + * x = p; + * while (x--) {} // <<-- infinite loop here + * } + * + */ +static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) +{ + struct bpf_reg_state *slot, *cur_slot; + struct bpf_func_state *state; + int i, fr; + + for (fr = old->curframe; fr >= 0; fr--) { + state = old->frame[fr]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_ITER) + continue; + + slot = &state->stack[i].spilled_ptr; + if (slot->iter.state != BPF_ITER_STATE_ACTIVE) + continue; + + cur_slot = &cur->frame[fr]->stack[i].spilled_ptr; + if (cur_slot->iter.depth != slot->iter.depth) + return true; + } + } + return false; +} + +static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!bpf_is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + } +} + +int bpf_is_state_visited(struct bpf_verifier_env *env, int insn_idx) +{ + struct bpf_verifier_state_list *new_sl; + struct 
bpf_verifier_state_list *sl; + struct bpf_verifier_state *cur = env->cur_state, *new; + bool force_new_state, add_new_state, loop; + int n, err, states_cnt = 0; + struct list_head *pos, *tmp, *head; + + force_new_state = env->test_state_freq || bpf_is_force_checkpoint(env, insn_idx) || + /* Avoid accumulating infinitely long jmp history */ + cur->jmp_history_cnt > 40; + + /* bpf progs typically have pruning point every 4 instructions + * http://vger.kernel.org/bpfconf2019.html#session-1 + * Do not add new state for future pruning if the verifier hasn't seen + * at least 2 jumps and at least 8 instructions. + * This heuristics helps decrease 'total_states' and 'peak_states' metric. + * In tests that amounts to up to 50% reduction into total verifier + * memory consumption and 20% verifier time speedup. + */ + add_new_state = force_new_state; + if (env->jmps_processed - env->prev_jmps_processed >= 2 && + env->insn_processed - env->prev_insn_processed >= 8) + add_new_state = true; + + /* keep cleaning the current state as registers/stack become dead */ + err = clean_verifier_state(env, cur); + if (err) + return err; + + loop = false; + head = bpf_explored_state(env, insn_idx); + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); + states_cnt++; + if (sl->state.insn_idx != insn_idx) + continue; + + if (sl->state.branches) { + struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; + + if (frame->in_async_callback_fn && + frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) { + /* Different async_entry_cnt means that the verifier is + * processing another entry into async callback. + * Seeing the same state is not an indication of infinite + * loop or infinite recursion. + * But finding the same state doesn't mean that it's safe + * to stop processing the current state. The previous state + * hasn't yet reached bpf_exit, since state.branches > 0. 
+ * Checking in_async_callback_fn alone is not enough either. + * Since the verifier still needs to catch infinite loops + * inside async callbacks. + */ + goto skip_inf_loop_check; + } + /* BPF open-coded iterators loop detection is special. + * states_maybe_looping() logic is too simplistic in detecting + * states that *might* be equivalent, because it doesn't know + * about ID remapping, so don't even perform it. + * See process_iter_next_call() and iter_active_depths_differ() + * for overview of the logic. When current and one of parent + * states are detected as equivalent, it's a good thing: we prove + * convergence and can stop simulating further iterations. + * It's safe to assume that iterator loop will finish, taking into + * account iter_next() contract of eventually returning + * sticky NULL result. + * + * Note, that states have to be compared exactly in this case because + * read and precision marks might not be finalized inside the loop. + * E.g. as in the program below: + * + * 1. r7 = -16 + * 2. r6 = bpf_get_prandom_u32() + * 3. while (bpf_iter_num_next(&fp[-8])) { + * 4. if (r6 != 42) { + * 5. r7 = -32 + * 6. r6 = bpf_get_prandom_u32() + * 7. continue + * 8. } + * 9. r0 = r10 + * 10. r0 += r7 + * 11. r8 = *(u64 *)(r0 + 0) + * 12. r6 = bpf_get_prandom_u32() + * 13. } + * + * Here verifier would first visit path 1-3, create a checkpoint at 3 + * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does + * not have read or precision mark for r7 yet, thus inexact states + * comparison would discard current state with r7=-32 + * => unsafe memory access at 11 would not be caught. 
+ */ + if (is_iter_next_insn(env, insn_idx)) { + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + struct bpf_func_state *cur_frame; + struct bpf_reg_state *iter_state, *iter_reg; + int spi; + + cur_frame = cur->frame[cur->curframe]; + /* btf_check_iter_kfuncs() enforces that + * iter state pointer is always the first arg + */ + iter_reg = &cur_frame->regs[BPF_REG_1]; + /* current state is valid due to states_equal(), + * so we can assume valid iter and reg state, + * no need for extra (re-)validations + */ + spi = bpf_get_spi(iter_reg->var_off.value); + iter_state = &bpf_func(env, iter_reg)->stack[spi].spilled_ptr; + if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { + loop = true; + goto hit; + } + } + goto skip_inf_loop_check; + } + if (is_may_goto_insn_at(env, insn_idx)) { + if (sl->state.may_goto_depth != cur->may_goto_depth && + states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + loop = true; + goto hit; + } + } + if (bpf_calls_callback(env, insn_idx)) { + if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { + loop = true; + goto hit; + } + goto skip_inf_loop_check; + } + /* attempt to detect infinite loop to avoid unnecessary doomed work */ + if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur, EXACT) && + !iter_active_depths_differ(&sl->state, cur) && + sl->state.may_goto_depth == cur->may_goto_depth && + sl->state.callback_unroll_depth == cur->callback_unroll_depth) { + verbose_linfo(env, insn_idx, "; "); + verbose(env, "infinite loop detected at insn %d\n", insn_idx); + verbose(env, "cur state:"); + print_verifier_state(env, cur, cur->curframe, true); + verbose(env, "old state:"); + print_verifier_state(env, &sl->state, cur->curframe, true); + return -EINVAL; + } + /* if the verifier is processing a loop, avoid adding new state + * too often, since different loop iterations have distinct + * states and may not help future pruning. 
+ * This threshold shouldn't be too low to make sure that + * a loop with large bound will be rejected quickly. + * The most abusive loop will be: + * r1 += 1 + * if r1 < 1000000 goto pc-2 + * 1M insn_procssed limit / 100 == 10k peak states. + * This threshold shouldn't be too high either, since states + * at the end of the loop are likely to be useful in pruning. + */ +skip_inf_loop_check: + if (!force_new_state && + env->jmps_processed - env->prev_jmps_processed < 20 && + env->insn_processed - env->prev_insn_processed < 100) + add_new_state = false; + goto miss; + } + /* See comments for mark_all_regs_read_and_precise() */ + loop = incomplete_read_marks(env, &sl->state); + if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { +hit: + sl->hit_cnt++; + + /* if previous state reached the exit with precision and + * current state is equivalent to it (except precision marks) + * the precision needs to be propagated back in + * the current state. + */ + err = 0; + if (bpf_is_jmp_point(env, env->insn_idx)) + err = bpf_push_jmp_history(env, cur, 0, 0); + err = err ? : propagate_precision(env, &sl->state, cur, NULL); + if (err) + return err; + /* When processing iterator based loops above propagate_liveness and + * propagate_precision calls are not sufficient to transfer all relevant + * read and precision marks. E.g. consider the following case: + * + * .-> A --. Assume the states are visited in the order A, B, C. + * | | | Assume that state B reaches a state equivalent to state A. + * | v v At this point, state C is not processed yet, so state A + * '-- B C has not received any read or precision marks from C. + * Thus, marks propagated from A to B are incomplete. + * + * The verifier mitigates this by performing the following steps: + * + * - Prior to the main verification pass, strongly connected components + * (SCCs) are computed over the program's control flow graph, + * intraprocedurally. 
+ * + * - During the main verification pass, `maybe_enter_scc()` checks + * whether the current verifier state is entering an SCC. If so, an + * instance of a `bpf_scc_visit` object is created, and the state + * entering the SCC is recorded as the entry state. + * + * - This instance is associated not with the SCC itself, but with a + * `bpf_scc_callchain`: a tuple consisting of the call sites leading to + * the SCC and the SCC id. See `compute_scc_callchain()`. + * + * - When a verification path encounters a `states_equal(..., + * RANGE_WITHIN)` condition, there exists a call chain describing the + * current state and a corresponding `bpf_scc_visit` instance. A copy + * of the current state is created and added to + * `bpf_scc_visit->backedges`. + * + * - When a verification path terminates, `maybe_exit_scc()` is called + * from `bpf_update_branch_counts()`. For states with `branches == 0`, it + * checks whether the state is the entry state of any `bpf_scc_visit` + * instance. If it is, this indicates that all paths originating from + * this SCC visit have been explored. `propagate_backedges()` is then + * called, which propagates read and precision marks through the + * backedges until a fixed point is reached. + * (In the earlier example, this would propagate marks from A to B, + * from C to A, and then again from A to B.) + * + * A note on callchains + * -------------------- + * + * Consider the following example: + * + * void foo() { loop { ... SCC#1 ... } } + * void main() { + * A: foo(); + * B: ... + * C: foo(); + * } + * + * Here, there are two distinct callchains leading to SCC#1: + * - (A, SCC#1) + * - (C, SCC#1) + * + * Each callchain identifies a separate `bpf_scc_visit` instance that + * accumulates backedge states. The `propagate_{liveness,precision}()` + * functions traverse the parent state of each backedge state, which + * means these parent states must remain valid (i.e., not freed) while + * the corresponding `bpf_scc_visit` instance exists. 
+ * + * Associating `bpf_scc_visit` instances directly with SCCs instead of + * callchains would break this invariant: + * - States explored during `C: foo()` would contribute backedges to + * SCC#1, but SCC#1 would only be exited once the exploration of + * `A: foo()` completes. + * - By that time, the states explored between `A: foo()` and `C: foo()` + * (i.e., `B: ...`) may have already been freed, causing the parent + * links for states from `C: foo()` to become invalid. + */ + if (loop) { + struct bpf_scc_backedge *backedge; + + backedge = kzalloc_obj(*backedge, + GFP_KERNEL_ACCOUNT); + if (!backedge) + return -ENOMEM; + err = bpf_copy_verifier_state(&backedge->state, cur); + backedge->state.equal_state = &sl->state; + backedge->state.insn_idx = insn_idx; + err = err ?: add_scc_backedge(env, &sl->state, backedge); + if (err) { + bpf_free_verifier_state(&backedge->state, false); + kfree(backedge); + return err; + } + } + return 1; + } +miss: + /* when new state is not going to be added do not increase miss count. + * Otherwise several loop iterations will remove the state + * recorded earlier. The goal of these heuristics is to have + * states from some iterations of the loop (some in the beginning + * and some at the end) to help pruning. + */ + if (add_new_state) + sl->miss_cnt++; + /* heuristic to determine whether this state is beneficial + * to keep checking from state equivalence point of view. + * Higher numbers increase max_states_per_insn and verification time, + * but do not meaningfully decrease insn_processed. + * 'n' controls how many times state could miss before eviction. + * Use bigger 'n' for checkpoints because evicting checkpoint states + * too early would hinder iterator convergence. + */ + n = bpf_is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3; + if (sl->miss_cnt > sl->hit_cnt * n + n) { + /* the state is unlikely to be useful. 
Remove it to + * speed up verification + */ + sl->in_free_list = true; + list_del(&sl->node); + list_add(&sl->node, &env->free_list); + env->free_list_size++; + env->explored_states_size--; + maybe_free_verifier_state(env, sl); + } + } + + if (env->max_states_per_insn < states_cnt) + env->max_states_per_insn = states_cnt; + + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + return 0; + + if (!add_new_state) + return 0; + + /* There were no equivalent states, remember the current one. + * Technically the current state is not proven to be safe yet, + * but it will either reach outer most bpf_exit (which means it's safe) + * or it will be rejected. When there are no loops the verifier won't be + * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) + * again on the way to bpf_exit. + * When looping the sl->state.branches will be > 0 and this state + * will not be considered for equivalence until branches == 0. + */ + new_sl = kzalloc_obj(struct bpf_verifier_state_list, GFP_KERNEL_ACCOUNT); + if (!new_sl) + return -ENOMEM; + env->total_states++; + env->explored_states_size++; + update_peak_states(env); + env->prev_jmps_processed = env->jmps_processed; + env->prev_insn_processed = env->insn_processed; + + /* forget precise markings we inherited, see __mark_chain_precision */ + if (env->bpf_capable) + mark_all_scalars_imprecise(env, cur); + + bpf_clear_singular_ids(env, cur); + + /* add new state to the head of linked list */ + new = &new_sl->state; + err = bpf_copy_verifier_state(new, cur); + if (err) { + bpf_free_verifier_state(new, false); + kfree(new_sl); + return err; + } + new->insn_idx = insn_idx; + verifier_bug_if(new->branches != 1, env, + "%s:branches_to_explore=%d insn %d", + __func__, new->branches, insn_idx); + err = maybe_enter_scc(env, new); + if (err) { + bpf_free_verifier_state(new, false); + kfree(new_sl); + return err; + } + + cur->parent = new; + cur->first_insn_idx = insn_idx; + cur->dfs_depth = new->dfs_depth + 
1; + bpf_clear_jmp_history(cur); + list_add(&new_sl->node, head); + return 0; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0378e83b4099..a3c0214ca934 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -941,14 +941,6 @@ static void bpf_map_free_rcu_gp(struct rcu_head *rcu) bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); } -static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) -{ - if (rcu_trace_implies_rcu_gp()) - bpf_map_free_rcu_gp(rcu); - else - call_rcu(rcu, bpf_map_free_rcu_gp); -} - /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ @@ -959,8 +951,9 @@ void bpf_map_put(struct bpf_map *map) bpf_map_free_id(map); WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); + /* RCU tasks trace grace period implies RCU grace period. */ if (READ_ONCE(map->free_after_mult_rcu_gp)) - call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_gp); else if (READ_ONCE(map->free_after_rcu_gp)) call_rcu(&map->rcu, bpf_map_free_rcu_gp); else @@ -1234,7 +1227,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) } EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) @@ -2832,7 +2825,7 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr sig = kvmemdup_bpfptr(usig, attr->signature_size); if (IS_ERR(sig)) { bpf_key_put(key); - return -ENOMEM; + return PTR_ERR(sig); } bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, @@ -3090,10 +3083,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_used_maps; - prog = bpf_prog_select_runtime(prog, &err); - if (err < 0) - goto free_used_maps; - err = bpf_prog_mark_insn_arrays_ready(prog); if 
(err < 0) goto free_used_maps; @@ -3261,12 +3250,16 @@ static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu) bpf_link_dealloc(link); } -static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu) +static bool bpf_link_is_tracepoint(struct bpf_link *link) { - if (rcu_trace_implies_rcu_gp()) - bpf_link_defer_dealloc_rcu_gp(rcu); - else - call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp); + /* + * Only these combinations support a tracepoint bpf_link. + * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use + * bpf_raw_tp_link_lops and thus dealloc_deferred(), see + * bpf_raw_tp_link_attach(). + */ + return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT || + (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP); } /* bpf_link_free is guaranteed to be called from process context */ @@ -3279,16 +3272,26 @@ static void bpf_link_free(struct bpf_link *link) if (link->prog) ops->release(link); if (ops->dealloc_deferred) { - /* Schedule BPF link deallocation, which will only then + /* + * Schedule BPF link deallocation, which will only then * trigger putting BPF program refcount. * If underlying BPF program is sleepable or BPF link's target * attach hookpoint is sleepable or otherwise requires RCU GPs * to ensure link and its underlying BPF program is not * reachable anymore, we need to first wait for RCU tasks - * trace sync, and then go through "classic" RCU grace period + * trace sync, and then go through "classic" RCU grace period. + * + * For tracepoint BPF links, we need to go through SRCU grace + * period wait instead when non-faultable tracepoint is used. We + * don't need to chain SRCU grace period waits, however, for the + * faultable case, since it exclusively uses RCU Tasks Trace. */ if (link->sleepable || (link->prog && link->prog->sleepable)) - call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp); + /* RCU Tasks Trace grace period implies RCU grace period. 
*/ + call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_rcu_gp); + /* We need to do a SRCU grace period wait for non-faultable tracepoint BPF links. */ + else if (bpf_link_is_tracepoint(link)) + call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp); else call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp); } else if (ops->dealloc) { @@ -3733,6 +3736,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, tr = prog->aux->dst_trampoline; tgt_prog = prog->aux->dst_prog; } + /* + * It is to prevent modifying struct pt_regs via kprobe_write_ctx=true + * freplace prog. Without this check, kprobe_write_ctx=true freplace + * prog is allowed to attach to kprobe_write_ctx=false kprobe prog, and + * then modify the registers of the kprobe prog's target kernel + * function. + * + * This also blocks the combination of uprobe+freplace, because it is + * unable to recognize the use of the tgt_prog as an uprobe or a kprobe + * by tgt_prog itself. At attach time, uprobe/kprobe is recognized by + * the target perf event flags in __perf_event_set_bpf_prog(). + */ + if (prog->type == BPF_PROG_TYPE_EXT && + prog->aux->kprobe_write_ctx != tgt_prog->aux->kprobe_write_ctx) { + err = -EINVAL; + goto out_unlock; + } err = bpf_link_prime(&link->link.link, &link_primer); if (err) @@ -6348,8 +6368,7 @@ static bool syscall_prog_is_valid_access(int off, int size, { if (off < 0 || off >= U16_MAX) return false; - if (off % size != 0) - return false; + /* No alignment requirements for syscall ctx accesses. 
*/ return true; } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 98d9b4c0daff..e791ae065c39 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -9,6 +9,8 @@ #include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> #include <linux/mm_types.h> +#include <linux/mmap_lock.h> +#include <linux/sched/mm.h> #include "mmap_unlock_work.h" static const char * const iter_task_type_names[] = { @@ -794,11 +796,20 @@ const struct bpf_func_proto bpf_find_vma_proto = { .arg5_type = ARG_ANYTHING, }; +static inline void bpf_iter_mmput_async(struct mm_struct *mm) +{ +#ifdef CONFIG_MMU + mmput_async(mm); +#else + mmput(mm); +#endif +} + struct bpf_iter_task_vma_kern_data { struct task_struct *task; struct mm_struct *mm; - struct mmap_unlock_irq_work *work; - struct vma_iterator vmi; + struct vm_area_struct snapshot; + u64 next_addr; }; struct bpf_iter_task_vma { @@ -819,12 +830,28 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, struct task_struct *task, u64 addr) { struct bpf_iter_task_vma_kern *kit = (void *)it; - bool irq_work_busy = false; int err; BUILD_BUG_ON(sizeof(struct bpf_iter_task_vma_kern) != sizeof(struct bpf_iter_task_vma)); BUILD_BUG_ON(__alignof__(struct bpf_iter_task_vma_kern) != __alignof__(struct bpf_iter_task_vma)); + if (!IS_ENABLED(CONFIG_PER_VMA_LOCK)) { + kit->data = NULL; + return -EOPNOTSUPP; + } + + /* + * Reject irqs-disabled contexts including NMI. Operations used + * by _next() and _destroy() (vma_end_read, fput, bpf_iter_mmput_async) + * can take spinlocks with IRQs disabled (pi_lock, pool->lock). + * Running from NMI or from a tracepoint that fires with those + * locks held could deadlock. 
+ */ + if (irqs_disabled()) { + kit->data = NULL; + return -EBUSY; + } + /* is_iter_reg_valid_uninit guarantees that kit hasn't been initialized * before, so non-NULL kit->data doesn't point to previously * bpf_mem_alloc'd bpf_iter_task_vma_kern_data @@ -834,38 +861,131 @@ __bpf_kfunc int bpf_iter_task_vma_new(struct bpf_iter_task_vma *it, return -ENOMEM; kit->data->task = get_task_struct(task); + /* + * Safely read task->mm and acquire an mm reference. + * + * Cannot use get_task_mm() because its task_lock() is a + * blocking spin_lock that would deadlock if the target task + * already holds alloc_lock on this CPU (e.g. a softirq BPF + * program iterating a task interrupted while holding its + * alloc_lock). + */ + if (!spin_trylock(&task->alloc_lock)) { + err = -EBUSY; + goto err_cleanup_iter; + } kit->data->mm = task->mm; + if (kit->data->mm && !(task->flags & PF_KTHREAD)) + mmget(kit->data->mm); + else + kit->data->mm = NULL; + spin_unlock(&task->alloc_lock); if (!kit->data->mm) { err = -ENOENT; goto err_cleanup_iter; } - /* kit->data->work == NULL is valid after bpf_mmap_unlock_get_irq_work */ - irq_work_busy = bpf_mmap_unlock_get_irq_work(&kit->data->work); - if (irq_work_busy || !mmap_read_trylock(kit->data->mm)) { - err = -EBUSY; - goto err_cleanup_iter; - } - - vma_iter_init(&kit->data->vmi, kit->data->mm, addr); + kit->data->snapshot.vm_file = NULL; + kit->data->next_addr = addr; return 0; err_cleanup_iter: - if (kit->data->task) - put_task_struct(kit->data->task); + put_task_struct(kit->data->task); bpf_mem_free(&bpf_global_ma, kit->data); /* NULL kit->data signals failed bpf_iter_task_vma initialization */ kit->data = NULL; return err; } +/* + * Find and lock the next VMA at or after data->next_addr. + * + * lock_vma_under_rcu() is a point lookup (mas_walk): it finds the VMA + * containing a given address but cannot iterate. 
An RCU-protected + * maple tree walk with vma_next() (mas_find) is needed first to locate + * the next VMA's vm_start across any gap. + * + * Between the RCU walk and the lock, the VMA may be removed, shrunk, + * or write-locked. On failure, advance past it using vm_end from the + * RCU walk. SLAB_TYPESAFE_BY_RCU can make vm_end stale, so fall back + * to PAGE_SIZE advancement to guarantee forward progress. + */ +static struct vm_area_struct * +bpf_iter_task_vma_find_next(struct bpf_iter_task_vma_kern_data *data) +{ + struct vm_area_struct *vma; + struct vma_iterator vmi; + unsigned long start, end; + +retry: + rcu_read_lock(); + vma_iter_init(&vmi, data->mm, data->next_addr); + vma = vma_next(&vmi); + if (!vma) { + rcu_read_unlock(); + return NULL; + } + start = vma->vm_start; + end = vma->vm_end; + rcu_read_unlock(); + + vma = lock_vma_under_rcu(data->mm, start); + if (!vma) { + if (end <= data->next_addr) + data->next_addr += PAGE_SIZE; + else + data->next_addr = end; + goto retry; + } + + if (unlikely(vma->vm_end <= data->next_addr)) { + data->next_addr += PAGE_SIZE; + vma_end_read(vma); + goto retry; + } + + return vma; +} + +static void bpf_iter_task_vma_snapshot_reset(struct vm_area_struct *snap) +{ + if (snap->vm_file) { + fput(snap->vm_file); + snap->vm_file = NULL; + } +} + __bpf_kfunc struct vm_area_struct *bpf_iter_task_vma_next(struct bpf_iter_task_vma *it) { struct bpf_iter_task_vma_kern *kit = (void *)it; + struct vm_area_struct *snap, *vma; if (!kit->data) /* bpf_iter_task_vma_new failed */ return NULL; - return vma_next(&kit->data->vmi); + + snap = &kit->data->snapshot; + + bpf_iter_task_vma_snapshot_reset(snap); + + vma = bpf_iter_task_vma_find_next(kit->data); + if (!vma) + return NULL; + + memcpy(snap, vma, sizeof(*snap)); + + /* + * The verifier only trusts vm_mm and vm_file (see + * BTF_TYPE_SAFE_TRUSTED_OR_NULL in verifier.c). Take a reference + * on vm_file; vm_mm is already correct because lock_vma_under_rcu() + * verifies vma->vm_mm == mm. 
All other pointers are untrusted by + * the verifier and left as-is. + */ + if (snap->vm_file) + get_file(snap->vm_file); + + kit->data->next_addr = vma->vm_end; + vma_end_read(vma); + return snap; } __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) @@ -873,8 +993,9 @@ __bpf_kfunc void bpf_iter_task_vma_destroy(struct bpf_iter_task_vma *it) struct bpf_iter_task_vma_kern *kit = (void *)it; if (kit->data) { - bpf_mmap_unlock_mm(kit->data->work, kit->data->mm); + bpf_iter_task_vma_snapshot_reset(&kit->data->snapshot); put_task_struct(kit->data->task); + bpf_iter_mmput_async(kit->data->mm); bpf_mem_free(&bpf_global_ma, kit->data); } } diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 26fbfbb01700..ec9c310cf5d7 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -269,3 +269,51 @@ struct tnum tnum_bswap64(struct tnum a) { return TNUM(swab64(a.value), swab64(a.mask)); } + +/* Given tnum t, and a number z such that tmin <= z < tmax, where tmin + * is the smallest member of the t (= t.value) and tmax is the largest + * member of t (= t.value | t.mask), returns the smallest member of t + * larger than z. + * + * For example, + * t = x11100x0 + * z = 11110001 (241) + * result = 11110010 (242) + * + * Note: if this function is called with z >= tmax, it just returns + * early with tmax; if this function is called with z < tmin, the + * algorithm already returns tmin. + */ +u64 tnum_step(struct tnum t, u64 z) +{ + u64 tmax, d, carry_mask, filled, inc; + + tmax = t.value | t.mask; + + /* if z >= largest member of t, return largest member of t */ + if (z >= tmax) + return tmax; + + /* if z < smallest member of t, return smallest member of t */ + if (z < t.value) + return t.value; + + /* + * Let r be the result tnum member, z = t.value + d. + * Every tnum member is t.value | s for some submask s of t.mask, + * and since t.value & t.mask == 0, t.value | s == t.value + s. + * So r > z becomes s > d where d = z - t.value. 
+ * + * Find the smallest submask s of t.mask greater than d by + * "incrementing d within the mask": fill every non-mask + * position with 1 (`filled`) so +1 ripples through the gaps, + * then keep only mask bits. `carry_mask` additionally fills + * positions below the highest non-mask 1 in d, preventing + * it from trapping the carry. + */ + d = z - t.value; + carry_mask = (1ULL << fls64(d & ~t.mask)) - 1; + filled = d | carry_mask | ~t.mask; + inc = (filled + 1) & t.mask; + return t.value | inc; +} diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 84db9e658e52..f02254a21585 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -1002,10 +1002,8 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, mutex_lock(&tr->mutex); shim_link = cgroup_shim_find(tr, bpf_func); - if (shim_link) { + if (shim_link && !IS_ERR(bpf_link_inc_not_zero(&shim_link->link.link))) { /* Reusing existing shim attached by the other program. */ - bpf_link_inc(&shim_link->link.link); - mutex_unlock(&tr->mutex); bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return 0; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb12ba020649..69d75515ed3f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -195,9 +195,6 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 -#define BPF_MAP_KEY_POISON (1ULL << 63) -#define BPF_MAP_KEY_SEEN (1ULL << 62) - #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512 #define BPF_PRIV_STACK_MIN_SIZE 64 @@ -210,16 +207,10 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); static bool is_trusted_reg(const struct bpf_reg_state *reg); - -static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) -{ - return aux->map_ptr_state.poison; -} - -static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) -{ - return 
aux->map_ptr_state.unpriv; -} +static inline bool in_sleepable_context(struct bpf_verifier_env *env); +static const char *non_sleepable_context_description(struct bpf_verifier_env *env); +static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); +static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, struct bpf_map *map, @@ -231,21 +222,6 @@ static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, aux->map_ptr_state.map_ptr = map; } -static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux) -{ - return aux->map_key_state & BPF_MAP_KEY_POISON; -} - -static bool bpf_map_key_unseen(const struct bpf_insn_aux_data *aux) -{ - return !(aux->map_key_state & BPF_MAP_KEY_SEEN); -} - -static u64 bpf_map_key_immediate(const struct bpf_insn_aux_data *aux) -{ - return aux->map_key_state & ~(BPF_MAP_KEY_SEEN | BPF_MAP_KEY_POISON); -} - static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) { bool poisoned = bpf_map_key_poisoned(aux); @@ -254,29 +230,6 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? 
BPF_MAP_KEY_POISON : 0ULL); } -static bool bpf_helper_call(const struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == 0; -} - -static bool bpf_pseudo_call(const struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == BPF_PSEUDO_CALL; -} - -static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == BPF_PSEUDO_KFUNC_CALL; -} - -struct bpf_map_desc { - struct bpf_map *ptr; - int uid; -}; - struct bpf_call_arg_meta { struct bpf_map_desc map; bool raw_mode; @@ -306,59 +259,6 @@ struct bpf_kfunc_meta { s32 id; }; -struct bpf_kfunc_call_arg_meta { - /* In parameters */ - struct btf *btf; - u32 func_id; - u32 kfunc_flags; - const struct btf_type *func_proto; - const char *func_name; - /* Out parameters */ - u32 ref_obj_id; - u8 release_regno; - bool r0_rdonly; - u32 ret_btf_id; - u64 r0_size; - u32 subprogno; - struct { - u64 value; - bool found; - } arg_constant; - - /* arg_{btf,btf_id,owning_ref} are used by kfunc-specific handling, - * generally to pass info about user-defined local kptr types to later - * verification logic - * bpf_obj_drop/bpf_percpu_obj_drop - * Record the local kptr type to be drop'd - * bpf_refcount_acquire (via KF_ARG_PTR_TO_REFCOUNTED_KPTR arg type) - * Record the local kptr type to be refcount_incr'd and use - * arg_owning_ref to determine whether refcount_acquire should be - * fallible - */ - struct btf *arg_btf; - u32 arg_btf_id; - bool arg_owning_ref; - bool arg_prog; - - struct { - struct btf_field *field; - } arg_list_head; - struct { - struct btf_field *field; - } arg_rbtree_root; - struct { - enum bpf_dynptr_type type; - u32 id; - u32 ref_obj_id; - } initialized_dynptr; - struct { - u8 spi; - u8 frameno; - } iter; - struct bpf_map_desc map; - u64 mem_size; -}; - struct btf *btf_vmlinux; static const char *btf_type_name(const struct btf *btf, u32 id) @@ -437,13 +337,36 @@ static struct 
btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } -static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) +bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; return aux && aux[subprog].linkage == BTF_FUNC_GLOBAL; } +static bool subprog_returns_void(struct bpf_verifier_env *env, int subprog) +{ + const struct btf_type *type, *func, *func_proto; + const struct btf *btf = env->prog->aux->btf; + u32 btf_id; + + btf_id = env->prog->aux->func_info[subprog].type_id; + + func = btf_type_by_id(btf, btf_id); + if (verifier_bug_if(!func, env, "btf_id %u not found", btf_id)) + return false; + + func_proto = btf_type_by_id(btf, func->type); + if (!func_proto) + return false; + + type = btf_type_skip_modifiers(btf, func_proto->type, NULL); + if (!type) + return false; + + return btf_type_is_void(type); +} + static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info *info; @@ -455,7 +378,7 @@ static const char *subprog_name(const struct bpf_verifier_env *env, int subprog) return btf_type_name(env->prog->aux->btf, info->type_id); } -static void mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) +void bpf_mark_subprog_exc_cb(struct bpf_verifier_env *env, int subprog) { struct bpf_subprog_info *info = subprog_info(env, subprog); @@ -543,13 +466,13 @@ static bool is_callback_calling_function(enum bpf_func_id func_id) is_async_callback_calling_function(func_id); } -static bool is_sync_callback_calling_insn(struct bpf_insn *insn) +bool bpf_is_sync_callback_calling_insn(struct bpf_insn *insn) { return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) || (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm)); } -static bool is_async_callback_calling_insn(struct bpf_insn *insn) +bool bpf_is_async_callback_calling_insn(struct bpf_insn *insn) { return 
(bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) || (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm)); @@ -570,24 +493,11 @@ static bool is_async_cb_sleepable(struct bpf_verifier_env *env, struct bpf_insn return false; } -static bool is_may_goto_insn(struct bpf_insn *insn) +bool bpf_is_may_goto_insn(struct bpf_insn *insn) { return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; } -static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx) -{ - return is_may_goto_insn(&env->prog->insnsi[insn_idx]); -} - -static bool is_storage_get_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_sk_storage_get || - func_id == BPF_FUNC_inode_storage_get || - func_id == BPF_FUNC_task_storage_get || - func_id == BPF_FUNC_cgrp_storage_get; -} - static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -603,32 +513,6 @@ static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, return ref_obj_uses > 1; } -static bool is_cmpxchg_insn(const struct bpf_insn *insn) -{ - return BPF_CLASS(insn->code) == BPF_STX && - BPF_MODE(insn->code) == BPF_ATOMIC && - insn->imm == BPF_CMPXCHG; -} - -static bool is_atomic_load_insn(const struct bpf_insn *insn) -{ - return BPF_CLASS(insn->code) == BPF_STX && - BPF_MODE(insn->code) == BPF_ATOMIC && - insn->imm == BPF_LOAD_ACQ; -} - -static int __get_spi(s32 off) -{ - return (-off - 1) / BPF_REG_SIZE; -} - -static struct bpf_func_state *func(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg) -{ - struct bpf_verifier_state *cur = env->cur_state; - - return cur->frame[reg->frameno]; -} static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { @@ -654,19 +538,19 @@ static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_s return -EINVAL; } - off = reg->off + reg->var_off.value; + off = reg->var_off.value; if (off % BPF_REG_SIZE) { verbose(env, 
"cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } - spi = __get_spi(off); + spi = bpf_get_spi(off); if (spi + 1 < nr_slots) { verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; } - if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots)) + if (!is_spi_bounds_valid(bpf_func(env, reg), spi, nr_slots)) return -ERANGE; return spi; } @@ -735,8 +619,6 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, bool first_slot, int dynptr_id); -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, struct bpf_reg_state *sreg1, @@ -762,7 +644,7 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); enum bpf_dynptr_type type; int spi, i, err; @@ -814,8 +696,6 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } - bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); - return 0; } @@ -828,15 +708,13 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat state->stack[spi - 1].slot_type[i] = STACK_INVALID; } - __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - - bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); + bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct 
bpf_func_state *state = bpf_func(env, reg); int spi, ref_obj_id, i; /* @@ -895,7 +773,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { if (!env->allow_ptr_leaks) - __mark_reg_not_init(env, reg); + bpf_mark_reg_not_init(env, reg); else __mark_reg_unknown(env, reg); } @@ -920,8 +798,27 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, spi = spi + 1; if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - verbose(env, "cannot overwrite referenced dynptr\n"); - return -EINVAL; + int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; + int ref_cnt = 0; + + /* + * A referenced dynptr can be overwritten only if there is at + * least one other dynptr sharing the same ref_obj_id, + * ensuring the reference can still be properly released. + */ + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_DYNPTR) + continue; + if (!state->stack[i].spilled_ptr.dynptr.first_slot) + continue; + if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id) + ref_cnt++; + } + + if (ref_cnt <= 1) { + verbose(env, "cannot overwrite referenced dynptr\n"); + return -EINVAL; + } } mark_stack_slot_scratched(env, spi); @@ -946,10 +843,8 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, /* Do not release reference state, we are destroying dynptr on stack, * not using some helper to release it. Just reset register. 
*/ - __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - - bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); + bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } @@ -984,7 +879,7 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int i, spi; /* This already represents first slot of initialized bpf_dynptr. @@ -1014,7 +909,7 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); enum bpf_dynptr_type dynptr_type; int spi; @@ -1044,7 +939,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int insn_idx, struct btf *btf, u32 btf_id, int nr_slots) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j, id; spi = iter_get_spi(env, reg, nr_slots); @@ -1076,7 +971,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_ITER; - bpf_mark_stack_write(env, state->frameno, BIT(spi - i)); mark_stack_slot_scratched(env, spi - i); } @@ -1086,7 +980,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j; spi = 
iter_get_spi(env, reg, nr_slots); @@ -1100,12 +994,11 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, if (i == 0) WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); - __mark_reg_not_init(env, st); + bpf_mark_reg_not_init(env, st); for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_INVALID; - bpf_mark_stack_write(env, state->frameno, BIT(spi - i)); mark_stack_slot_scratched(env, spi - i); } @@ -1115,7 +1008,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j; /* For -ERANGE (i.e. spi not falling into allocated stack slots), we @@ -1142,7 +1035,7 @@ static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, struct btf *btf, u32 btf_id, int nr_slots) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi, i, j; spi = iter_get_spi(env, reg, nr_slots); @@ -1179,7 +1072,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int insn_idx, int kfunc_class) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i, id; @@ -1195,7 +1088,6 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, slot = &state->stack[spi]; st = &slot->spilled_ptr; - bpf_mark_stack_write(env, reg->frameno, BIT(spi)); __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ st->ref_obj_id = id; @@ -1211,7 +1103,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state 
*reg, int kfunc_class) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i, err; @@ -1249,9 +1141,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r return err; } - __mark_reg_not_init(env, st); - - bpf_mark_stack_write(env, reg->frameno, BIT(spi)); + bpf_mark_reg_not_init(env, st); for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_INVALID; @@ -1262,7 +1152,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; int spi, i; @@ -1286,7 +1176,7 @@ static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bp static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); struct bpf_stack_state *slot; struct bpf_reg_state *st; int spi, i; @@ -1324,6 +1214,7 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack) case STACK_IRQ_FLAG: return true; case STACK_INVALID: + case STACK_POISON: case STACK_MISC: case STACK_ZERO: return false; @@ -1336,26 +1227,12 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack) /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. 
*/ -static bool is_spilled_reg(const struct bpf_stack_state *stack) -{ - return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL; -} -static bool is_spilled_scalar_reg(const struct bpf_stack_state *stack) -{ - return stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL && - stack->spilled_ptr.type == SCALAR_VALUE; -} - -static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack) -{ - return stack->slot_type[0] == STACK_SPILL && - stack->spilled_ptr.type == SCALAR_VALUE; -} - -/* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which - * case they are equivalent, or it's STACK_ZERO, in which case we preserve - * more precise STACK_ZERO. +/* + * Mark stack slot as STACK_MISC, unless it is already: + * - STACK_INVALID, in which case they are equivalent. + * - STACK_ZERO, in which case we preserve more precise STACK_ZERO. + * - STACK_POISON, which truly forbids access to the slot. * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged * mode), we won't promote STACK_INVALID to STACK_MISC. 
In privileged case it is * unnecessary as both are considered equivalent when loading data and pruning, @@ -1366,14 +1243,14 @@ static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype) { if (*stype == STACK_ZERO) return; - if (*stype == STACK_INVALID) + if (*stype == STACK_INVALID || *stype == STACK_POISON) return; *stype = STACK_MISC; } static void scrub_spilled_slot(u8 *stype) { - if (*stype != STACK_INVALID) + if (*stype != STACK_INVALID && *stype != STACK_POISON) *stype = STACK_MISC; } @@ -1662,14 +1539,6 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st return NULL; } -static void update_peak_states(struct bpf_verifier_env *env) -{ - u32 cur_states; - - cur_states = env->explored_states_size + env->free_list_size + env->num_backedges; - env->peak_states = max(env->peak_states, cur_states); -} - static void free_func_state(struct bpf_func_state *state) { if (!state) @@ -1678,15 +1547,15 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } -static void clear_jmp_history(struct bpf_verifier_state *state) +void bpf_clear_jmp_history(struct bpf_verifier_state *state) { kfree(state->jmp_history); state->jmp_history = NULL; state->jmp_history_cnt = 0; } -static void free_verifier_state(struct bpf_verifier_state *state, - bool free_self) +void bpf_free_verifier_state(struct bpf_verifier_state *state, + bool free_self) { int i; @@ -1695,42 +1564,11 @@ static void free_verifier_state(struct bpf_verifier_state *state, state->frame[i] = NULL; } kfree(state->refs); - clear_jmp_history(state); + bpf_clear_jmp_history(state); if (free_self) kfree(state); } -/* struct bpf_verifier_state->parent refers to states - * that are in either of env->{expored_states,free_list}. - * In both cases the state is contained in struct bpf_verifier_state_list. 
- */ -static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st) -{ - if (st->parent) - return container_of(st->parent, struct bpf_verifier_state_list, state); - return NULL; -} - -static bool incomplete_read_marks(struct bpf_verifier_env *env, - struct bpf_verifier_state *st); - -/* A state can be freed if it is no longer referenced: - * - is in the env->free_list; - * - has no children states; - */ -static void maybe_free_verifier_state(struct bpf_verifier_env *env, - struct bpf_verifier_state_list *sl) -{ - if (!sl->in_free_list - || sl->state.branches != 0 - || incomplete_read_marks(env, &sl->state)) - return; - list_del(&sl->node); - free_verifier_state(&sl->state, false); - kfree(sl); - env->free_list_size--; -} - /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ @@ -1741,8 +1579,8 @@ static int copy_func_state(struct bpf_func_state *dst, return copy_stack_state(dst, src); } -static int copy_verifier_state(struct bpf_verifier_state *dst_state, - const struct bpf_verifier_state *src) +int bpf_copy_verifier_state(struct bpf_verifier_state *dst_state, + const struct bpf_verifier_state *src) { struct bpf_func_state *dst; int i, err; @@ -1766,7 +1604,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return err; dst_state->speculative = src->speculative; dst_state->in_sleepable = src->in_sleepable; - dst_state->cleaned = src->cleaned; dst_state->curframe = src->curframe; dst_state->branches = src->branches; dst_state->parent = src->parent; @@ -1796,7 +1633,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env) return env->prog->len; } -static struct list_head *explored_state(struct bpf_verifier_env *env, int idx) +struct list_head *bpf_explored_state(struct bpf_verifier_env *env, int idx) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_func_state *state = cur->frame[cur->curframe]; @@ -1818,266 +1655,19 @@ static bool 
same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta return true; } -/* Return IP for a given frame in a call stack */ -static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame) -{ - return frame == st->curframe - ? st->insn_idx - : st->frame[frame + 1]->callsite; -} - -/* For state @st look for a topmost frame with frame_insn_idx() in some SCC, - * if such frame exists form a corresponding @callchain as an array of - * call sites leading to this frame and SCC id. - * E.g.: - * - * void foo() { A: loop {... SCC#1 ...}; } - * void bar() { B: loop { C: foo(); ... SCC#2 ... } - * D: loop { E: foo(); ... SCC#3 ... } } - * void main() { F: bar(); } - * - * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending - * on @st frame call sites being (F,C,A) or (F,E,A). - */ -static bool compute_scc_callchain(struct bpf_verifier_env *env, - struct bpf_verifier_state *st, - struct bpf_scc_callchain *callchain) -{ - u32 i, scc, insn_idx; - - memset(callchain, 0, sizeof(*callchain)); - for (i = 0; i <= st->curframe; i++) { - insn_idx = frame_insn_idx(st, i); - scc = env->insn_aux_data[insn_idx].scc; - if (scc) { - callchain->scc = scc; - break; - } else if (i < st->curframe) { - callchain->callsites[i] = insn_idx; - } else { - return false; - } - } - return true; -} - -/* Check if bpf_scc_visit instance for @callchain exists. */ -static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env, - struct bpf_scc_callchain *callchain) -{ - struct bpf_scc_info *info = env->scc_info[callchain->scc]; - struct bpf_scc_visit *visits = info->visits; - u32 i; - - if (!info) - return NULL; - for (i = 0; i < info->num_visits; i++) - if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0) - return &visits[i]; - return NULL; -} - -/* Allocate a new bpf_scc_visit instance corresponding to @callchain. - * Allocated instances are alive for a duration of the do_check_common() - * call and are freed by free_states(). 
- */ -static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, - struct bpf_scc_callchain *callchain) -{ - struct bpf_scc_visit *visit; - struct bpf_scc_info *info; - u32 scc, num_visits; - u64 new_sz; - - scc = callchain->scc; - info = env->scc_info[scc]; - num_visits = info ? info->num_visits : 0; - new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); - info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT); - if (!info) - return NULL; - env->scc_info[scc] = info; - info->num_visits = num_visits + 1; - visit = &info->visits[num_visits]; - memset(visit, 0, sizeof(*visit)); - memcpy(&visit->callchain, callchain, sizeof(*callchain)); - return visit; -} - -/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */ -static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) -{ - char *buf = env->tmp_str_buf; - int i, delta = 0; - - delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "("); - for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) { - if (!callchain->callsites[i]) - break; - delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,", - callchain->callsites[i]); - } - delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc); - return env->tmp_str_buf; -} - -/* If callchain for @st exists (@st is in some SCC), ensure that - * bpf_scc_visit instance for this callchain exists. - * If instance does not exist or is empty, assign visit->entry_state to @st. 
- */ -static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - struct bpf_scc_callchain *callchain = &env->callchain_buf; - struct bpf_scc_visit *visit; - - if (!compute_scc_callchain(env, st, callchain)) - return 0; - visit = scc_visit_lookup(env, callchain); - visit = visit ?: scc_visit_alloc(env, callchain); - if (!visit) - return -ENOMEM; - if (!visit->entry_state) { - visit->entry_state = st; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "SCC enter %s\n", format_callchain(env, callchain)); - } - return 0; -} - -static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit); - -/* If callchain for @st exists (@st is in some SCC), make it empty: - * - set visit->entry_state to NULL; - * - flush accumulated backedges. - */ -static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - struct bpf_scc_callchain *callchain = &env->callchain_buf; - struct bpf_scc_visit *visit; - - if (!compute_scc_callchain(env, st, callchain)) - return 0; - visit = scc_visit_lookup(env, callchain); - if (!visit) { - /* - * If path traversal stops inside an SCC, corresponding bpf_scc_visit - * must exist for non-speculative paths. For non-speculative paths - * traversal stops when: - * a. Verification error is found, maybe_exit_scc() is not called. - * b. Top level BPF_EXIT is reached. Top level BPF_EXIT is not a member - * of any SCC. - * c. A checkpoint is reached and matched. Checkpoints are created by - * is_state_visited(), which calls maybe_enter_scc(), which allocates - * bpf_scc_visit instances for checkpoints within SCCs. - * (c) is the only case that can reach this point. 
- */ - if (!st->speculative) { - verifier_bug(env, "scc exit: no visit info for call chain %s", - format_callchain(env, callchain)); - return -EFAULT; - } - return 0; - } - if (visit->entry_state != st) - return 0; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "SCC exit %s\n", format_callchain(env, callchain)); - visit->entry_state = NULL; - env->num_backedges -= visit->num_backedges; - visit->num_backedges = 0; - update_peak_states(env); - return propagate_backedges(env, visit); -} - -/* Lookup an bpf_scc_visit instance corresponding to @st callchain - * and add @backedge to visit->backedges. @st callchain must exist. - */ -static int add_scc_backedge(struct bpf_verifier_env *env, - struct bpf_verifier_state *st, - struct bpf_scc_backedge *backedge) -{ - struct bpf_scc_callchain *callchain = &env->callchain_buf; - struct bpf_scc_visit *visit; - - if (!compute_scc_callchain(env, st, callchain)) { - verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d", - st->insn_idx); - return -EFAULT; - } - visit = scc_visit_lookup(env, callchain); - if (!visit) { - verifier_bug(env, "add backedge: no visit info for call chain %s", - format_callchain(env, callchain)); - return -EFAULT; - } - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "SCC backedge %s\n", format_callchain(env, callchain)); - backedge->next = visit->backedges; - visit->backedges = backedge; - visit->num_backedges++; - env->num_backedges++; - update_peak_states(env); - return 0; -} - -/* bpf_reg_state->live marks for registers in a state @st are incomplete, - * if state @st is in some SCC and not all execution paths starting at this - * SCC are fully explored. 
- */ -static bool incomplete_read_marks(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) -{ - struct bpf_scc_callchain *callchain = &env->callchain_buf; - struct bpf_scc_visit *visit; - if (!compute_scc_callchain(env, st, callchain)) - return false; - visit = scc_visit_lookup(env, callchain); - if (!visit) - return false; - return !!visit->backedges; -} - -static void free_backedges(struct bpf_scc_visit *visit) +void bpf_free_backedges(struct bpf_scc_visit *visit) { struct bpf_scc_backedge *backedge, *next; for (backedge = visit->backedges; backedge; backedge = next) { - free_verifier_state(&backedge->state, false); + bpf_free_verifier_state(&backedge->state, false); next = backedge->next; kfree(backedge); } visit->backedges = NULL; } -static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - struct bpf_verifier_state_list *sl = NULL, *parent_sl; - struct bpf_verifier_state *parent; - int err; - - while (st) { - u32 br = --st->branches; - - /* verifier_bug_if(br > 1, ...) 
technically makes sense here, - * but see comment in push_stack(), hence: - */ - verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br); - if (br) - break; - err = maybe_exit_scc(env, st); - if (err) - return err; - parent = st->parent; - parent_sl = state_parent_as_list(st); - if (sl) - maybe_free_verifier_state(env, sl); - st = parent; - sl = parent_sl; - } - return 0; -} - static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx, bool pop_log) { @@ -2089,7 +1679,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, return -ENOENT; if (cur) { - err = copy_verifier_state(cur, &head->st); + err = bpf_copy_verifier_state(cur, &head->st); if (err) return err; } @@ -2100,7 +1690,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (prev_insn_idx) *prev_insn_idx = head->prev_insn_idx; elem = head->next; - free_verifier_state(&head->st, false); + bpf_free_verifier_state(&head->st, false); kfree(head); env->head = elem; env->stack_size--; @@ -2137,7 +1727,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; - err = copy_verifier_state(&elem->st, cur); + err = bpf_copy_verifier_state(&elem->st, cur); if (err) return ERR_PTR(-ENOMEM); elem->st.speculative |= speculative; @@ -2161,7 +1751,6 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; } -#define CALLER_SAVED_REGS 6 static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; @@ -2224,13 +1813,6 @@ static void __mark_reg_const_zero(const struct bpf_verifier_env *env, struct bpf static void mark_reg_known_zero(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { - if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs 
*/ - for (regno = 0; regno < MAX_BPF_REG; regno++) - __mark_reg_not_init(env, regs + regno); - return; - } __mark_reg_known_zero(regs + regno); } @@ -2281,11 +1863,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno, struct btf_field_graph_root *ds_head) { - __mark_reg_known_zero(®s[regno]); + __mark_reg_known(®s[regno], ds_head->node_offset); regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[regno].btf = ds_head->btf; regs[regno].btf_id = ds_head->value_btf_id; - regs[regno].off = ds_head->node_offset; } static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) @@ -2316,7 +1897,6 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, */ return reg->type == which && reg->id == 0 && - reg->off == 0 && tnum_equals_const(reg->var_off, 0); } @@ -2379,6 +1959,9 @@ static void __update_reg32_bounds(struct bpf_reg_state *reg) static void __update_reg64_bounds(struct bpf_reg_state *reg) { + u64 tnum_next, tmax; + bool umin_in_tnum; + /* min signed is max(sign bit) | min(other bits) */ reg->smin_value = max_t(s64, reg->smin_value, reg->var_off.value | (reg->var_off.mask & S64_MIN)); @@ -2388,6 +1971,33 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) reg->umin_value = max(reg->umin_value, reg->var_off.value); reg->umax_value = min(reg->umax_value, reg->var_off.value | reg->var_off.mask); + + /* Check if u64 and tnum overlap in a single value */ + tnum_next = tnum_step(reg->var_off, reg->umin_value); + umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value; + tmax = reg->var_off.value | reg->var_off.mask; + if (umin_in_tnum && tnum_next > reg->umax_value) { + /* The u64 range and the tnum only overlap in umin. 
+ * u64: ---[xxxxxx]----- + * tnum: --xx----------x- + */ + ___mark_reg_known(reg, reg->umin_value); + } else if (!umin_in_tnum && tnum_next == tmax) { + /* The u64 range and the tnum only overlap in the maximum value + * represented by the tnum, called tmax. + * u64: ---[xxxxxx]----- + * tnum: xx-----x-------- + */ + ___mark_reg_known(reg, tmax); + } else if (!umin_in_tnum && tnum_next <= reg->umax_value && + tnum_step(reg->var_off, tnum_next) > reg->umax_value) { + /* The u64 range and the tnum only overlap in between umin + * (excluded) and umax. + * u64: ---[xxxxxx]----- + * tnum: xx----x-------x- + */ + ___mark_reg_known(reg, tnum_next); + } } static void __update_reg_bounds(struct bpf_reg_state *reg) @@ -2397,7 +2007,7 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) } /* Uses signed min/max values to inform unsigned, and vice-versa */ -static void __reg32_deduce_bounds(struct bpf_reg_state *reg) +static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) { /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 * bits to improve our u32/s32 boundaries. @@ -2467,6 +2077,10 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); } +} + +static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) +{ /* if u32 range forms a valid s32 range (due to matching sign bit), * try to learn from that */ @@ -2481,10 +2095,34 @@ static void __reg32_deduce_bounds(struct bpf_reg_state *reg) if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); + } else { + if (reg->u32_max_value < (u32)reg->s32_min_value) { + /* See __reg64_deduce_bounds() for detailed explanation. 
+ * Refine ranges in the following situation: + * + * 0 U32_MAX + * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s32 range xxxxxxxxx] [xxxxxxx| + * 0 S32_MAX S32_MIN -1 + */ + reg->s32_min_value = (s32)reg->u32_min_value; + reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value); + } else if ((u32)reg->s32_max_value < reg->u32_min_value) { + /* + * 0 U32_MAX + * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxxxxxx] [xxxxxxxxxxxx s32 range | + * 0 S32_MAX S32_MIN -1 + */ + reg->s32_max_value = (s32)reg->u32_max_value; + reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value); + } } } -static void __reg64_deduce_bounds(struct bpf_reg_state *reg) +static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) { /* If u64 range forms a valid s64 range (due to matching sign bit), * try to learn from that. Let's do a bit of ASCII art to see when @@ -2619,7 +2257,7 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) } } -static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) +static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) { /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit * values on both sides of 64-bit range in hope to have tighter range. 
@@ -2688,9 +2326,10 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) static void __reg_deduce_bounds(struct bpf_reg_state *reg) { - __reg32_deduce_bounds(reg); - __reg64_deduce_bounds(reg); - __reg_deduce_mixed_bounds(reg); + deduce_bounds_64_from_64(reg); + deduce_bounds_32_from_64(reg); + deduce_bounds_32_from_32(reg); + deduce_bounds_64_from_32(reg); } /* Attempts to improve var_off based on unsigned min/max information */ @@ -2706,14 +2345,18 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } +static bool range_bounds_violation(struct bpf_reg_state *reg); + static void reg_bounds_sync(struct bpf_reg_state *reg) { + /* If the input reg_state is invalid, we can exit early */ + if (range_bounds_violation(reg)) + return; /* We might have learned new bounds from the var_off. */ __update_reg_bounds(reg); /* We might have learned something about the sign bit. */ __reg_deduce_bounds(reg); __reg_deduce_bounds(reg); - __reg_deduce_bounds(reg); /* We might have learned some bits from the bounds. 
*/ __reg_bound_offset(reg); /* Intersecting with the old var_off might have improved our bounds @@ -2723,39 +2366,55 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); } +static bool range_bounds_violation(struct bpf_reg_state *reg) +{ + return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value || + reg->u32_min_value > reg->u32_max_value || + reg->s32_min_value > reg->s32_max_value); +} + +static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) +{ + u64 uval = reg->var_off.value; + s64 sval = (s64)uval; + + if (!tnum_is_const(reg->var_off)) + return false; + + return reg->umin_value != uval || reg->umax_value != uval || + reg->smin_value != sval || reg->smax_value != sval; +} + +static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) +{ + u32 uval32 = tnum_subreg(reg->var_off).value; + s32 sval32 = (s32)uval32; + + if (!tnum_subreg_is_const(reg->var_off)) + return false; + + return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || + reg->s32_min_value != sval32 || reg->s32_max_value != sval32; +} + static int reg_bounds_sanity_check(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *ctx) { const char *msg; - if (reg->umin_value > reg->umax_value || - reg->smin_value > reg->smax_value || - reg->u32_min_value > reg->u32_max_value || - reg->s32_min_value > reg->s32_max_value) { - msg = "range bounds violation"; - goto out; + if (range_bounds_violation(reg)) { + msg = "range bounds violation"; + goto out; } - if (tnum_is_const(reg->var_off)) { - u64 uval = reg->var_off.value; - s64 sval = (s64)uval; - - if (reg->umin_value != uval || reg->umax_value != uval || - reg->smin_value != sval || reg->smax_value != sval) { - msg = "const tnum out of sync with range bounds"; - goto out; - } + if (const_tnum_range_mismatch(reg)) { + msg = "const tnum out of sync with range bounds"; + goto out; } - if (tnum_subreg_is_const(reg->var_off)) { - u32 uval32 = 
tnum_subreg(reg->var_off).value; - s32 sval32 = (s32)uval32; - - if (reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || - reg->s32_min_value != sval32 || reg->s32_max_value != sval32) { - msg = "const subreg tnum out of sync with range bounds"; - goto out; - } + if (const_tnum_range_mismatch_32(reg)) { + msg = "const subreg tnum out of sync with range bounds"; + goto out; } return 0; @@ -2798,7 +2457,7 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) } /* Mark a register as having a completely unknown (scalar) value. */ -static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg) +void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { /* * Clear type, off, and union(map_ptr, range) and @@ -2820,20 +2479,13 @@ static void __mark_reg_unknown_imprecise(struct bpf_reg_state *reg) static void __mark_reg_unknown(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - __mark_reg_unknown_imprecise(reg); + bpf_mark_reg_unknown_imprecise(reg); reg->precise = !env->bpf_capable; } static void mark_reg_unknown(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { - if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose(env, "mark_reg_unknown(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs except FP */ - for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(env, regs + regno); - return; - } __mark_reg_unknown(env, regs + regno); } @@ -2856,26 +2508,13 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env, return reg_bounds_sanity_check(env, reg, "s32_range"); } -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +void bpf_mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg) { __mark_reg_unknown(env, reg); reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, u32 regno) -{ - if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose(env, 
"mark_reg_not_init(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs except FP */ - for (regno = 0; regno < BPF_REG_FP; regno++) - __mark_reg_not_init(env, regs + regno); - return; - } - __mark_reg_not_init(env, regs + regno); -} - static int mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, enum bpf_reg_type reg_type, @@ -2913,7 +2552,7 @@ static void init_reg_state(struct bpf_verifier_env *env, int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(env, regs, i); + bpf_mark_reg_not_init(env, &regs[i]); regs[i].subreg_def = DEF_NOT_SUBREG; } @@ -2925,10 +2564,13 @@ static struct bpf_retval_range retval_range(s32 minval, s32 maxval) { - return (struct bpf_retval_range){ minval, maxval }; + /* + * return_32bit is set to false by default and set explicitly + * by the caller when necessary. + */ + return (struct bpf_retval_range){ minval, maxval, false }; } -#define BPF_MAIN_FUNC (-1) static void init_func_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int callsite, int frameno, int subprogno) @@ -2965,7 +2607,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, env->stack_size); return ERR_PTR(-E2BIG); } - /* Unlike push_stack() do not copy_verifier_state(). + /* Unlike push_stack() do not bpf_copy_verifier_state(). * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). 
@@ -2984,12 +2626,6 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, } -enum reg_arg_type { - SRC_OP, /* register is used as source operand */ - DST_OP, /* register is used as destination operand */ - DST_OP_NO_MARK /* same as above, check only, don't mark */ -}; - static int cmp_subprogs(const void *a, const void *b) { return ((struct bpf_subprog_info *)a)->start - @@ -3018,7 +2654,7 @@ struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *en } /* Find subprogram that starts exactly at 'off' */ -static int find_subprog(struct bpf_verifier_env *env, int off) +int bpf_find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; @@ -3037,7 +2673,7 @@ static int add_subprog(struct bpf_verifier_env *env, int off) verbose(env, "call to invalid destination\n"); return -EINVAL; } - ret = find_subprog(env, off); + ret = bpf_find_subprog(env, off); if (ret >= 0) return ret; if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { @@ -3113,41 +2749,19 @@ static int bpf_find_exception_callback_insn_off(struct bpf_verifier_env *env) return ret; } -#define MAX_KFUNC_DESCS 256 #define MAX_KFUNC_BTFS 256 -struct bpf_kfunc_desc { - struct btf_func_model func_model; - u32 func_id; - s32 imm; - u16 offset; - unsigned long addr; -}; - struct bpf_kfunc_btf { struct btf *btf; struct module *module; u16 offset; }; -struct bpf_kfunc_desc_tab { - /* Sorted by func_id (BTF ID) and offset (fd_array offset) during - * verification. JITs do lookups by bpf_insn, where func_id may not be - * available, therefore at the end of verification do_misc_fixups() - * sorts this by imm and offset. 
- */ - struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS]; - u32 nr_descs; -}; - struct bpf_kfunc_btf_tab { struct bpf_kfunc_btf descs[MAX_KFUNC_BTFS]; u32 nr_descs; }; -static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, - int insn_idx); - static int kfunc_desc_cmp_by_id_off(const void *a, const void *b) { const struct bpf_kfunc_desc *d0 = a; @@ -3375,7 +2989,7 @@ static int fetch_kfunc_meta(struct bpf_verifier_env *env, return 0; } -static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) +int bpf_add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, u16 offset) { struct bpf_kfunc_btf_tab *btf_tab; struct btf_func_model func_model; @@ -3470,95 +3084,11 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return 0; } -static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b) -{ - const struct bpf_kfunc_desc *d0 = a; - const struct bpf_kfunc_desc *d1 = b; - - if (d0->imm != d1->imm) - return d0->imm < d1->imm ? -1 : 1; - if (d0->offset != d1->offset) - return d0->offset < d1->offset ? 
-1 : 1; - return 0; -} - -static int set_kfunc_desc_imm(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc) -{ - unsigned long call_imm; - - if (bpf_jit_supports_far_kfunc_call()) { - call_imm = desc->func_id; - } else { - call_imm = BPF_CALL_IMM(desc->addr); - /* Check whether the relative offset overflows desc->imm */ - if ((unsigned long)(s32)call_imm != call_imm) { - verbose(env, "address of kernel func_id %u is out of range\n", - desc->func_id); - return -EINVAL; - } - } - desc->imm = call_imm; - return 0; -} - -static int sort_kfunc_descs_by_imm_off(struct bpf_verifier_env *env) -{ - struct bpf_kfunc_desc_tab *tab; - int i, err; - - tab = env->prog->aux->kfunc_tab; - if (!tab) - return 0; - - for (i = 0; i < tab->nr_descs; i++) { - err = set_kfunc_desc_imm(env, &tab->descs[i]); - if (err) - return err; - } - - sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), - kfunc_desc_cmp_by_imm_off, NULL); - return 0; -} - bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) { return !!prog->aux->kfunc_tab; } -const struct btf_func_model * -bpf_jit_find_kfunc_model(const struct bpf_prog *prog, - const struct bpf_insn *insn) -{ - const struct bpf_kfunc_desc desc = { - .imm = insn->imm, - .offset = insn->off, - }; - const struct bpf_kfunc_desc *res; - struct bpf_kfunc_desc_tab *tab; - - tab = prog->aux->kfunc_tab; - res = bsearch(&desc, tab->descs, tab->nr_descs, - sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off); - - return res ? 
&res->func_model : NULL; -} - -static int add_kfunc_in_insns(struct bpf_verifier_env *env, - struct bpf_insn *insn, int cnt) -{ - int i, ret; - - for (i = 0; i < cnt; i++, insn++) { - if (bpf_pseudo_kfunc_call(insn)) { - ret = add_kfunc_call(env, insn->imm, insn->off); - if (ret < 0) - return ret; - } - } - return 0; -} - static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; @@ -3583,7 +3113,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn)) ret = add_subprog(env, i + insn->imm + 1); else - ret = add_kfunc_call(env, insn->imm, insn->off); + ret = bpf_add_kfunc_call(env, insn->imm, insn->off); if (ret < 0) return ret; @@ -3605,7 +3135,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) if (env->subprog_info[i].start != ex_cb_insn) continue; env->exception_callback_subprog = i; - mark_subprog_exc_cb(env, i); + bpf_mark_subprog_exc_cb(env, i); break; } } @@ -3678,17 +3208,101 @@ next: return 0; } +/* + * Sort subprogs in topological order so that leaf subprogs come first and + * their callers come later. This is a DFS post-order traversal of the call + * graph. Scan only reachable instructions (those in the computed postorder) of + * the current subprog to discover callees (direct subprogs and sync + * callbacks). + */ +static int sort_subprogs_topo(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *si = env->subprog_info; + int *insn_postorder = env->cfg.insn_postorder; + struct bpf_insn *insn = env->prog->insnsi; + int cnt = env->subprog_cnt; + int *dfs_stack = NULL; + int top = 0, order = 0; + int i, ret = 0; + u8 *color = NULL; + + color = kvzalloc_objs(*color, cnt, GFP_KERNEL_ACCOUNT); + dfs_stack = kvmalloc_objs(*dfs_stack, cnt, GFP_KERNEL_ACCOUNT); + if (!color || !dfs_stack) { + ret = -ENOMEM; + goto out; + } + + /* + * DFS post-order traversal. + * Color values: 0 = unvisited, 1 = on stack, 2 = done. 
+ */ + for (i = 0; i < cnt; i++) { + if (color[i]) + continue; + color[i] = 1; + dfs_stack[top++] = i; + + while (top > 0) { + int cur = dfs_stack[top - 1]; + int po_start = si[cur].postorder_start; + int po_end = si[cur + 1].postorder_start; + bool pushed = false; + int j; + + for (j = po_start; j < po_end; j++) { + int idx = insn_postorder[j]; + int callee; + + if (!bpf_pseudo_call(&insn[idx]) && !bpf_pseudo_func(&insn[idx])) + continue; + callee = bpf_find_subprog(env, idx + insn[idx].imm + 1); + if (callee < 0) { + ret = -EFAULT; + goto out; + } + if (color[callee] == 2) + continue; + if (color[callee] == 1) { + if (bpf_pseudo_func(&insn[idx])) + continue; + verbose(env, "recursive call from %s() to %s()\n", + subprog_name(env, cur), + subprog_name(env, callee)); + ret = -EINVAL; + goto out; + } + color[callee] = 1; + dfs_stack[top++] = callee; + pushed = true; + break; + } + + if (!pushed) { + color[cur] = 2; + env->subprog_topo_order[order++] = cur; + top--; + } + } + } + + if (env->log.level & BPF_LOG_LEVEL2) + for (i = 0; i < cnt; i++) + verbose(env, "topo_order[%d] = %s\n", + i, subprog_name(env, env->subprog_topo_order[i])); +out: + kvfree(dfs_stack); + kvfree(color); + return ret; +} + static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { - int err, i; + int i; - for (i = 0; i < nr_slots; i++) { - err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i)); - if (err) - return err; + for (i = 0; i < nr_slots; i++) mark_stack_slot_scratched(env, spi - i); - } return 0; } @@ -3732,8 +3346,8 @@ static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. 
 */ -static bool is_reg64(struct bpf_insn *insn, - u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) +bool bpf_is_reg64(struct bpf_insn *insn, + u32 regno, struct bpf_reg_state *reg, enum bpf_reg_arg_type t) { u8 code, class, op; @@ -3818,41 +3432,6 @@ static bool is_reg64(struct bpf_insn *insn, return true; } -/* Return the regno defined by the insn, or -1. */ -static int insn_def_regno(const struct bpf_insn *insn) -{ - switch (BPF_CLASS(insn->code)) { - case BPF_JMP: - case BPF_JMP32: - case BPF_ST: - return -1; - case BPF_STX: - if (BPF_MODE(insn->code) == BPF_ATOMIC || - BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) { - if (insn->imm == BPF_CMPXCHG) - return BPF_REG_0; - else if (insn->imm == BPF_LOAD_ACQ) - return insn->dst_reg; - else if (insn->imm & BPF_FETCH) - return insn->src_reg; - } - return -1; - default: - return insn->dst_reg; - } -} - -/* Return TRUE if INSN has defined any 32-bit value explicitly. */ -static bool insn_has_def32(struct bpf_insn *insn) -{ - int dst_reg = insn_def_regno(insn); - - if (dst_reg == -1) - return false; - - return !is_reg64(insn, dst_reg, NULL, DST_OP); -} - static void mark_insn_zext(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { @@ -3867,21 +3446,16 @@ static void mark_insn_zext(struct bpf_verifier_env *env, } static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, - enum reg_arg_type t) + enum bpf_reg_arg_type t) { struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; struct bpf_reg_state *reg; bool rw64; - if (regno >= MAX_BPF_REG) { - verbose(env, "R%d is invalid\n", regno); - return -EINVAL; - } - mark_reg_scratched(env, regno); reg = &regs[regno]; - rw64 = is_reg64(insn, regno, reg, t); + rw64 = bpf_is_reg64(insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (reg->type == NOT_INIT) { @@ -3910,7 +3484,7 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r } static int 
check_reg_arg(struct bpf_verifier_env *env, u32 regno, - enum reg_arg_type t) + enum bpf_reg_arg_type t) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -3923,24 +3497,9 @@ static int insn_stack_access_flags(int frameno, int spi) return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; } -static int insn_stack_access_spi(int insn_flags) -{ - return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; -} - -static int insn_stack_access_frameno(int insn_flags) -{ - return insn_flags & INSN_F_FRAMENO_MASK; -} - -static void mark_jmp_point(struct bpf_verifier_env *env, int idx) -{ - env->insn_aux_data[idx].jmp_point = true; -} - -static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) +static void mark_indirect_target(struct bpf_verifier_env *env, int idx) { - return env->insn_aux_data[insn_idx].jmp_point; + env->insn_aux_data[idx].indirect_target = true; } #define LR_FRAMENO_BITS 3 @@ -4021,91 +3580,6 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } } -/* for any branch, call, exit record the history of jmps in the given state */ -static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) -{ - u32 cnt = cur->jmp_history_cnt; - struct bpf_jmp_history_entry *p; - size_t alloc_size; - - /* combine instruction flags if we already recorded this instruction */ - if (env->cur_hist_ent) { - /* atomic instructions push insn_flags twice, for READ and - * WRITE sides, but they should agree on stack slot - */ - verifier_bug_if((env->cur_hist_ent->flags & insn_flags) && - (env->cur_hist_ent->flags & insn_flags) != insn_flags, - env, "insn history: insn_idx %d cur flags %x new flags %x", - env->insn_idx, env->cur_hist_ent->flags, insn_flags); - env->cur_hist_ent->flags |= insn_flags; - verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, - "insn history: insn_idx %d linked_regs: %#llx", - 
env->insn_idx, env->cur_hist_ent->linked_regs); - env->cur_hist_ent->linked_regs = linked_regs; - return 0; - } - - cnt++; - alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); - p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT); - if (!p) - return -ENOMEM; - cur->jmp_history = p; - - p = &cur->jmp_history[cnt - 1]; - p->idx = env->insn_idx; - p->prev_idx = env->prev_insn_idx; - p->flags = insn_flags; - p->linked_regs = linked_regs; - cur->jmp_history_cnt = cnt; - env->cur_hist_ent = p; - - return 0; -} - -static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, - u32 hist_end, int insn_idx) -{ - if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) - return &st->jmp_history[hist_end - 1]; - return NULL; -} - -/* Backtrack one insn at a time. If idx is not at the top of recorded - * history then previous instruction came from straight line execution. - * Return -ENOENT if we exhausted all instructions within given state. - * - * It's legal to have a bit of a looping with the same starting and ending - * insn index within the same state, e.g.: 3->4->5->3, so just because current - * instruction index is the same as state's first_idx doesn't mean we are - * done. If there is still some jump history left, we should keep going. We - * need to take into account that we might have a jump history between given - * state's parent and itself, due to checkpointing. In this case, we'll have - * history entry recording a jump from last instruction of parent state and - * first instruction of given state. 
- */ -static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, - u32 *history) -{ - u32 cnt = *history; - - if (i == st->first_insn_idx) { - if (cnt == 0) - return -ENOENT; - if (cnt == 1 && st->jmp_history[0].idx == i) - return -ENOENT; - } - - if (cnt && st->jmp_history[cnt - 1].idx == i) { - i = st->jmp_history[cnt - 1].prev_idx; - (*history)--; - } else { - i--; - } - return i; -} - static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) { const struct btf_type *func; @@ -4122,7 +3596,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } -static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) +void bpf_verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) { const struct bpf_insn_cbs cbs = { .cb_call = disasm_kfunc_name, @@ -4133,158 +3607,10 @@ static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } -static inline void bt_init(struct backtrack_state *bt, u32 frame) -{ - bt->frame = frame; -} - -static inline void bt_reset(struct backtrack_state *bt) -{ - struct bpf_verifier_env *env = bt->env; - - memset(bt, 0, sizeof(*bt)); - bt->env = env; -} - -static inline u32 bt_empty(struct backtrack_state *bt) -{ - u64 mask = 0; - int i; - - for (i = 0; i <= bt->frame; i++) - mask |= bt->reg_masks[i] | bt->stack_masks[i]; - - return mask == 0; -} - -static inline int bt_subprog_enter(struct backtrack_state *bt) -{ - if (bt->frame == MAX_CALL_FRAMES - 1) { - verifier_bug(bt->env, "subprog enter from frame %d", bt->frame); - return -EFAULT; - } - bt->frame++; - return 0; -} - -static inline int bt_subprog_exit(struct backtrack_state *bt) -{ - if (bt->frame == 0) { - verifier_bug(bt->env, "subprog exit from frame 0"); - return -EFAULT; - } - bt->frame--; - return 0; -} - -static inline void bt_set_frame_reg(struct backtrack_state *bt, u32 frame, u32 
reg) -{ - bt->reg_masks[frame] |= 1 << reg; -} - -static inline void bt_clear_frame_reg(struct backtrack_state *bt, u32 frame, u32 reg) -{ - bt->reg_masks[frame] &= ~(1 << reg); -} - -static inline void bt_set_reg(struct backtrack_state *bt, u32 reg) -{ - bt_set_frame_reg(bt, bt->frame, reg); -} - -static inline void bt_clear_reg(struct backtrack_state *bt, u32 reg) -{ - bt_clear_frame_reg(bt, bt->frame, reg); -} - -static inline void bt_set_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) -{ - bt->stack_masks[frame] |= 1ull << slot; -} - -static inline void bt_clear_frame_slot(struct backtrack_state *bt, u32 frame, u32 slot) -{ - bt->stack_masks[frame] &= ~(1ull << slot); -} - -static inline u32 bt_frame_reg_mask(struct backtrack_state *bt, u32 frame) -{ - return bt->reg_masks[frame]; -} - -static inline u32 bt_reg_mask(struct backtrack_state *bt) -{ - return bt->reg_masks[bt->frame]; -} - -static inline u64 bt_frame_stack_mask(struct backtrack_state *bt, u32 frame) -{ - return bt->stack_masks[frame]; -} - -static inline u64 bt_stack_mask(struct backtrack_state *bt) -{ - return bt->stack_masks[bt->frame]; -} - -static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) -{ - return bt->reg_masks[bt->frame] & (1 << reg); -} - -static inline bool bt_is_frame_reg_set(struct backtrack_state *bt, u32 frame, u32 reg) -{ - return bt->reg_masks[frame] & (1 << reg); -} - -static inline bool bt_is_frame_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) -{ - return bt->stack_masks[frame] & (1ull << slot); -} - -/* format registers bitmask, e.g., "r0,r2,r4" for 0x15 mask */ -static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) -{ - DECLARE_BITMAP(mask, 64); - bool first = true; - int i, n; - - buf[0] = '\0'; - - bitmap_from_u64(mask, reg_mask); - for_each_set_bit(i, mask, 32) { - n = snprintf(buf, buf_sz, "%sr%d", first ? 
"" : ",", i); - first = false; - buf += n; - buf_sz -= n; - if (buf_sz < 0) - break; - } -} -/* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ -void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) -{ - DECLARE_BITMAP(mask, 64); - bool first = true; - int i, n; - - buf[0] = '\0'; - - bitmap_from_u64(mask, stack_mask); - for_each_set_bit(i, mask, 64) { - n = snprintf(buf, buf_sz, "%s%d", first ? "" : ",", -(i + 1) * 8); - first = false; - buf += n; - buf_sz -= n; - if (buf_sz < 0) - break; - } -} - /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. */ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) +void bpf_bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -4311,713 +3637,15 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_histo struct linked_reg *e = &linked_regs.entries[i]; if (e->is_reg) - bt_set_frame_reg(bt, e->frameno, e->regno); + bpf_bt_set_frame_reg(bt, e->frameno, e->regno); else - bt_set_frame_slot(bt, e->frameno, e->spi); - } -} - -/* For given verifier state backtrack_insn() is called from the last insn to - * the first insn. Its purpose is to compute a bitmask of registers and - * stack slots that needs precision in the parent verifier state. - * - * @idx is an index of the instruction we are currently processing; - * @subseq_idx is an index of the subsequent instruction that: - * - *would be* executed next, if jump history is viewed in forward order; - * - *was* processed previously during backtracking. 
- */ -static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) -{ - struct bpf_insn *insn = env->prog->insnsi + idx; - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - u8 mode = BPF_MODE(insn->code); - u32 dreg = insn->dst_reg; - u32 sreg = insn->src_reg; - u32 spi, i, fr; - - if (insn->code == 0) - return 0; - if (env->log.level & BPF_LOG_LEVEL2) { - fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); - verbose(env, "mark_precise: frame%d: regs=%s ", - bt->frame, env->tmp_str_buf); - bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); - verbose(env, "stack=%s before ", env->tmp_str_buf); - verbose(env, "%d: ", idx); - verbose_insn(env, insn); - } - - /* If there is a history record that some registers gained range at this insn, - * propagate precision marks to those registers, so that bt_is_reg_set() - * accounts for these registers. - */ - bt_sync_linked_regs(bt, hist); - - if (class == BPF_ALU || class == BPF_ALU64) { - if (!bt_is_reg_set(bt, dreg)) - return 0; - if (opcode == BPF_END || opcode == BPF_NEG) { - /* sreg is reserved and unused - * dreg still need precision before this insn - */ - return 0; - } else if (opcode == BPF_MOV) { - if (BPF_SRC(insn->code) == BPF_X) { - /* dreg = sreg or dreg = (s8, s16, s32)sreg - * dreg needs precision after this insn - * sreg needs precision before this insn - */ - bt_clear_reg(bt, dreg); - if (sreg != BPF_REG_FP) - bt_set_reg(bt, sreg); - } else { - /* dreg = K - * dreg needs precision after this insn. - * Corresponding register is already marked - * as precise=true in this verifier state. 
- * No further markings in parent are necessary - */ - bt_clear_reg(bt, dreg); - } - } else { - if (BPF_SRC(insn->code) == BPF_X) { - /* dreg += sreg - * both dreg and sreg need precision - * before this insn - */ - if (sreg != BPF_REG_FP) - bt_set_reg(bt, sreg); - } /* else dreg += K - * dreg still needs precision before this insn - */ - } - } else if (class == BPF_LDX || is_atomic_load_insn(insn)) { - if (!bt_is_reg_set(bt, dreg)) - return 0; - bt_clear_reg(bt, dreg); - - /* scalars can only be spilled into stack w/o losing precision. - * Load from any other memory can be zero extended. - * The desire to keep that precision is already indicated - * by 'precise' mark in corresponding register of this state. - * No further tracking necessary. - */ - if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) - return 0; - /* dreg = *(u64 *)[fp - off] was a fill from the stack. - * that [fp - off] slot contains scalar that needs to be - * tracked with precision - */ - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); - bt_set_frame_slot(bt, fr, spi); - } else if (class == BPF_STX || class == BPF_ST) { - if (bt_is_reg_set(bt, dreg)) - /* stx & st shouldn't be using _scalar_ dst_reg - * to access memory. It means backtracking - * encountered a case of pointer subtraction. 
- */ - return -ENOTSUPP; - /* scalars can only be spilled into stack */ - if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) - return 0; - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); - if (!bt_is_frame_slot_set(bt, fr, spi)) - return 0; - bt_clear_frame_slot(bt, fr, spi); - if (class == BPF_STX) - bt_set_reg(bt, sreg); - } else if (class == BPF_JMP || class == BPF_JMP32) { - if (bpf_pseudo_call(insn)) { - int subprog_insn_idx, subprog; - - subprog_insn_idx = idx + insn->imm + 1; - subprog = find_subprog(env, subprog_insn_idx); - if (subprog < 0) - return -EFAULT; - - if (subprog_is_global(env, subprog)) { - /* check that jump history doesn't have any - * extra instructions from subprog; the next - * instruction after call to global subprog - * should be literally next instruction in - * caller program - */ - verifier_bug_if(idx + 1 != subseq_idx, env, - "extra insn from subprog"); - /* r1-r5 are invalidated after subprog call, - * so for global func call it shouldn't be set - * anymore - */ - if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verifier_bug(env, "global subprog unexpected regs %x", - bt_reg_mask(bt)); - return -EFAULT; - } - /* global subprog always sets R0 */ - bt_clear_reg(bt, BPF_REG_0); - return 0; - } else { - /* static subprog call instruction, which - * means that we are exiting current subprog, - * so only r1-r5 could be still requested as - * precise, r0 and r6-r10 or any stack slot in - * the current frame should be zero by now - */ - if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verifier_bug(env, "static subprog unexpected regs %x", - bt_reg_mask(bt)); - return -EFAULT; - } - /* we are now tracking register spills correctly, - * so any instance of leftover slots is a bug - */ - if (bt_stack_mask(bt) != 0) { - verifier_bug(env, - "static subprog leftover stack slots %llx", - bt_stack_mask(bt)); - return -EFAULT; - } - /* propagate r1-r5 to the caller */ - for (i = BPF_REG_1; i <= BPF_REG_5; i++) { - if 
(bt_is_reg_set(bt, i)) { - bt_clear_reg(bt, i); - bt_set_frame_reg(bt, bt->frame - 1, i); - } - } - if (bt_subprog_exit(bt)) - return -EFAULT; - return 0; - } - } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) { - /* exit from callback subprog to callback-calling helper or - * kfunc call. Use idx/subseq_idx check to discern it from - * straight line code backtracking. - * Unlike the subprog call handling above, we shouldn't - * propagate precision of r1-r5 (if any requested), as they are - * not actually arguments passed directly to callback subprogs - */ - if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) { - verifier_bug(env, "callback unexpected regs %x", - bt_reg_mask(bt)); - return -EFAULT; - } - if (bt_stack_mask(bt) != 0) { - verifier_bug(env, "callback leftover stack slots %llx", - bt_stack_mask(bt)); - return -EFAULT; - } - /* clear r1-r5 in callback subprog's mask */ - for (i = BPF_REG_1; i <= BPF_REG_5; i++) - bt_clear_reg(bt, i); - if (bt_subprog_exit(bt)) - return -EFAULT; - return 0; - } else if (opcode == BPF_CALL) { - /* kfunc with imm==0 is invalid and fixup_kfunc_call will - * catch this error later. Make backtracking conservative - * with ENOTSUPP. - */ - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) - return -ENOTSUPP; - /* regular helper call sets R0 */ - bt_clear_reg(bt, BPF_REG_0); - if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - /* if backtracking was looking for registers R1-R5 - * they should have been found already. - */ - verifier_bug(env, "backtracking call unexpected regs %x", - bt_reg_mask(bt)); - return -EFAULT; - } - if (insn->src_reg == BPF_REG_0 && insn->imm == BPF_FUNC_tail_call - && subseq_idx - idx != 1) { - if (bt_subprog_enter(bt)) - return -EFAULT; - } - } else if (opcode == BPF_EXIT) { - bool r0_precise; - - /* Backtracking to a nested function call, 'idx' is a part of - * the inner frame 'subseq_idx' is a part of the outer frame. 
- * In case of a regular function call, instructions giving - * precision to registers R1-R5 should have been found already. - * In case of a callback, it is ok to have R1-R5 marked for - * backtracking, as these registers are set by the function - * invoking callback. - */ - if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx)) - for (i = BPF_REG_1; i <= BPF_REG_5; i++) - bt_clear_reg(bt, i); - if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { - verifier_bug(env, "backtracking exit unexpected regs %x", - bt_reg_mask(bt)); - return -EFAULT; - } - - /* BPF_EXIT in subprog or callback always returns - * right after the call instruction, so by checking - * whether the instruction at subseq_idx-1 is subprog - * call or not we can distinguish actual exit from - * *subprog* from exit from *callback*. In the former - * case, we need to propagate r0 precision, if - * necessary. In the former we never do that. - */ - r0_precise = subseq_idx - 1 >= 0 && - bpf_pseudo_call(&env->prog->insnsi[subseq_idx - 1]) && - bt_is_reg_set(bt, BPF_REG_0); - - bt_clear_reg(bt, BPF_REG_0); - if (bt_subprog_enter(bt)) - return -EFAULT; - - if (r0_precise) - bt_set_reg(bt, BPF_REG_0); - /* r6-r9 and stack slots will stay set in caller frame - * bitmasks until we return back from callee(s) - */ - return 0; - } else if (BPF_SRC(insn->code) == BPF_X) { - if (!bt_is_reg_set(bt, dreg) && !bt_is_reg_set(bt, sreg)) - return 0; - /* dreg <cond> sreg - * Both dreg and sreg need precision before - * this insn. If only sreg was marked precise - * before it would be equally necessary to - * propagate it to dreg. - */ - if (!hist || !(hist->flags & INSN_F_SRC_REG_STACK)) - bt_set_reg(bt, sreg); - if (!hist || !(hist->flags & INSN_F_DST_REG_STACK)) - bt_set_reg(bt, dreg); - } else if (BPF_SRC(insn->code) == BPF_K) { - /* dreg <cond> K - * Only dreg still needs precision before - * this insn, so for the K-based conditional - * there is nothing new to be marked. 
- */ - } - } else if (class == BPF_LD) { - if (!bt_is_reg_set(bt, dreg)) - return 0; - bt_clear_reg(bt, dreg); - /* It's ld_imm64 or ld_abs or ld_ind. - * For ld_imm64 no further tracking of precision - * into parent is necessary - */ - if (mode == BPF_IND || mode == BPF_ABS) - /* to be analyzed */ - return -ENOTSUPP; - } - /* Propagate precision marks to linked registers, to account for - * registers marked as precise in this function. - */ - bt_sync_linked_regs(bt, hist); - return 0; -} - -/* the scalar precision tracking algorithm: - * . at the start all registers have precise=false. - * . scalar ranges are tracked as normal through alu and jmp insns. - * . once precise value of the scalar register is used in: - * . ptr + scalar alu - * . if (scalar cond K|scalar) - * . helper_call(.., scalar, ...) where ARG_CONST is expected - * backtrack through the verifier states and mark all registers and - * stack slots with spilled constants that these scalar registers - * should be precise. - * . during state pruning two registers (or spilled stack slots) - * are equivalent if both are not precise. - * - * Note the verifier cannot simply walk register parentage chain, - * since many different registers and stack slots could have been - * used to compute single precise scalar. - * - * The approach of starting with precise=true for all registers and then - * backtrack to mark a register as not precise when the verifier detects - * that program doesn't care about specific value (e.g., when helper - * takes register as ARG_ANYTHING parameter) is not safe. - * - * It's ok to walk single parentage chain of the verifier states. - * It's possible that this backtracking will go all the way till 1st insn. - * All other branches will be explored for needing precision later. 
- * - * The backtracking needs to deal with cases like: - * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) - * r9 -= r8 - * r5 = r9 - * if r5 > 0x79f goto pc+7 - * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) - * r5 += 1 - * ... - * call bpf_perf_event_output#25 - * where .arg5_type = ARG_CONST_SIZE_OR_ZERO - * - * and this case: - * r6 = 1 - * call foo // uses callee's r6 inside to compute r0 - * r0 += r6 - * if r0 == 0 goto - * - * to track above reg_mask/stack_mask needs to be independent for each frame. - * - * Also if parent's curframe > frame where backtracking started, - * the verifier need to mark registers in both frames, otherwise callees - * may incorrectly prune callers. This is similar to - * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") - * - * For now backtracking falls back into conservative marking. - */ -static void mark_all_scalars_precise(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) -{ - struct bpf_func_state *func; - struct bpf_reg_state *reg; - int i, j; - - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "mark_precise: frame%d: falling back to forcing all scalars precise\n", - st->curframe); - } - - /* big hammer: mark all scalars precise in this path. - * pop_stack may still get !precise scalars. - * We also skip current state and go straight to first parent state, - * because precision markings in current non-checkpointed state are - * not needed. See why in the comment in __mark_chain_precision below. 
- */ - for (st = st->parent; st; st = st->parent) { - for (i = 0; i <= st->curframe; i++) { - func = st->frame[i]; - for (j = 0; j < BPF_REG_FP; j++) { - reg = &func->regs[j]; - if (reg->type != SCALAR_VALUE || reg->precise) - continue; - reg->precise = true; - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "force_precise: frame%d: forcing r%d to be precise\n", - i, j); - } - } - for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { - if (!is_spilled_reg(&func->stack[j])) - continue; - reg = &func->stack[j].spilled_ptr; - if (reg->type != SCALAR_VALUE || reg->precise) - continue; - reg->precise = true; - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "force_precise: frame%d: forcing fp%d to be precise\n", - i, -(j + 1) * 8); - } - } - } - } -} - -static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) -{ - struct bpf_func_state *func; - struct bpf_reg_state *reg; - int i, j; - - for (i = 0; i <= st->curframe; i++) { - func = st->frame[i]; - for (j = 0; j < BPF_REG_FP; j++) { - reg = &func->regs[j]; - if (reg->type != SCALAR_VALUE) - continue; - reg->precise = false; - } - for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { - if (!is_spilled_reg(&func->stack[j])) - continue; - reg = &func->stack[j].spilled_ptr; - if (reg->type != SCALAR_VALUE) - continue; - reg->precise = false; - } + bpf_bt_set_frame_slot(bt, e->frameno, e->spi); } } -/* - * __mark_chain_precision() backtracks BPF program instruction sequence and - * chain of verifier states making sure that register *regno* (if regno >= 0) - * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked - * SCALARS, as well as any other registers and slots that contribute to - * a tracked state of given registers/stack slots, depending on specific BPF - * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). 
This backtracking relies on recorded jmp_history and is able to - * traverse entire chain of parent states. This process ends only when all the - * necessary registers/slots and their transitive dependencies are marked as - * precise. - * - * One important and subtle aspect is that precise marks *do not matter* in - * the currently verified state (current state). It is important to understand - * why this is the case. - * - * First, note that current state is the state that is not yet "checkpointed", - * i.e., it is not yet put into env->explored_states, and it has no children - * states as well. It's ephemeral, and can end up either a) being discarded if - * compatible explored state is found at some point or BPF_EXIT instruction is - * reached or b) checkpointed and put into env->explored_states, branching out - * into one or more children states. - * - * In the former case, precise markings in current state are completely - * ignored by state comparison code (see regsafe() for details). Only - * checkpointed ("old") state precise markings are important, and if old - * state's register/slot is precise, regsafe() assumes current state's - * register/slot as precise and checks value ranges exactly and precisely. If - * states turn out to be compatible, current state's necessary precise - * markings and any required parent states' precise markings are enforced - * after the fact with propagate_precision() logic, after the fact. But it's - * important to realize that in this case, even after marking current state - * registers/slots as precise, we immediately discard current state. So what - * actually matters is any of the precise markings propagated into current - * state's parent states, which are always checkpointed (due to b) case above). - * As such, for scenario a) it doesn't matter if current state has precise - * markings set or not. - * - * Now, for the scenario b), checkpointing and forking into child(ren) - * state(s). 
Note that before current state gets to checkpointing step, any - * processed instruction always assumes precise SCALAR register/slot - * knowledge: if precise value or range is useful to prune jump branch, BPF - * verifier takes this opportunity enthusiastically. Similarly, when - * register's value is used to calculate offset or memory address, exact - * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to - * what we mentioned above about state comparison ignoring precise markings - * during state comparison, BPF verifier ignores and also assumes precise - * markings *at will* during instruction verification process. But as verifier - * assumes precision, it also propagates any precision dependencies across - * parent states, which are not yet finalized, so can be further restricted - * based on new knowledge gained from restrictions enforced by their children - * states. This is so that once those parent states are finalized, i.e., when - * they have no more active children state, state comparison logic in - * is_state_visited() would enforce strict and precise SCALAR ranges, if - * required for correctness. - * - * To build a bit more intuition, note also that once a state is checkpointed, - * the path we took to get to that state is not important. This is crucial - * property for state pruning. When state is checkpointed and finalized at - * some instruction index, it can be correctly and safely used to "short - * circuit" any *compatible* state that reaches exactly the same instruction - * index. I.e., if we jumped to that instruction from a completely different - * code path than original finalized state was derived from, it doesn't - * matter, current state can be discarded because from that instruction - * forward having a compatible state will ensure we will safely reach the - * exit. States describe preconditions for further exploration, but completely - * forget the history of how we got here. 
- * - * This also means that even if we needed precise SCALAR range to get to - * finalized state, but from that point forward *that same* SCALAR register is - * never used in a precise context (i.e., it's precise value is not needed for - * correctness), it's correct and safe to mark such register as "imprecise" - * (i.e., precise marking set to false). This is what we rely on when we do - * not set precise marking in current state. If no child state requires - * precision for any given SCALAR register, it's safe to dictate that it can - * be imprecise. If any child state does require this register to be precise, - * we'll mark it precise later retroactively during precise markings - * propagation from child state to parent states. - * - * Skipping precise marking setting in current state is a mild version of - * relying on the above observation. But we can utilize this property even - * more aggressively by proactively forgetting any precise marking in the - * current state (which we inherited from the parent state), right before we - * checkpoint it and branch off into new child state. This is done by - * mark_all_scalars_imprecise() to hopefully get more permissive and generic - * finalized states which help in short circuiting more future states. 
- */ -static int __mark_chain_precision(struct bpf_verifier_env *env, - struct bpf_verifier_state *starting_state, - int regno, - bool *changed) -{ - struct bpf_verifier_state *st = starting_state; - struct backtrack_state *bt = &env->bt; - int first_idx = st->first_insn_idx; - int last_idx = starting_state->insn_idx; - int subseq_idx = -1; - struct bpf_func_state *func; - bool tmp, skip_first = true; - struct bpf_reg_state *reg; - int i, fr, err; - - if (!env->bpf_capable) - return 0; - - changed = changed ?: &tmp; - /* set frame number from which we are starting to backtrack */ - bt_init(bt, starting_state->curframe); - - /* Do sanity checks against current state of register and/or stack - * slot, but don't set precise flag in current state, as precision - * tracking in the current state is unnecessary. - */ - func = st->frame[bt->frame]; - if (regno >= 0) { - reg = &func->regs[regno]; - if (reg->type != SCALAR_VALUE) { - verifier_bug(env, "backtracking misuse"); - return -EFAULT; - } - bt_set_reg(bt, regno); - } - - if (bt_empty(bt)) - return 0; - - for (;;) { - DECLARE_BITMAP(mask, 64); - u32 history = st->jmp_history_cnt; - struct bpf_jmp_history_entry *hist; - - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", - bt->frame, last_idx, first_idx, subseq_idx); - } - - if (last_idx < 0) { - /* we are at the entry into subprog, which - * is expected for global funcs, but only if - * requested precise registers are R1-R5 - * (which are global func's input arguments) - */ - if (st->curframe == 0 && - st->frame[0]->subprogno > 0 && - st->frame[0]->callsite == BPF_MAIN_FUNC && - bt_stack_mask(bt) == 0 && - (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) == 0) { - bitmap_from_u64(mask, bt_reg_mask(bt)); - for_each_set_bit(i, mask, 32) { - reg = &st->frame[0]->regs[i]; - bt_clear_reg(bt, i); - if (reg->type == SCALAR_VALUE) { - reg->precise = true; - *changed = true; - } - } - return 0; - } - - 
verifier_bug(env, "backtracking func entry subprog %d reg_mask %x stack_mask %llx", - st->frame[0]->subprogno, bt_reg_mask(bt), bt_stack_mask(bt)); - return -EFAULT; - } - - for (i = last_idx;;) { - if (skip_first) { - err = 0; - skip_first = false; - } else { - hist = get_jmp_hist_entry(st, history, i); - err = backtrack_insn(env, i, subseq_idx, hist, bt); - } - if (err == -ENOTSUPP) { - mark_all_scalars_precise(env, starting_state); - bt_reset(bt); - return 0; - } else if (err) { - return err; - } - if (bt_empty(bt)) - /* Found assignment(s) into tracked register in this state. - * Since this state is already marked, just return. - * Nothing to be tracked further in the parent state. - */ - return 0; - subseq_idx = i; - i = get_prev_insn_idx(st, i, &history); - if (i == -ENOENT) - break; - if (i >= env->prog->len) { - /* This can happen if backtracking reached insn 0 - * and there are still reg_mask or stack_mask - * to backtrack. - * It means the backtracking missed the spot where - * particular register was initialized with a constant. 
- */ - verifier_bug(env, "backtracking idx %d", i); - return -EFAULT; - } - } - st = st->parent; - if (!st) - break; - - for (fr = bt->frame; fr >= 0; fr--) { - func = st->frame[fr]; - bitmap_from_u64(mask, bt_frame_reg_mask(bt, fr)); - for_each_set_bit(i, mask, 32) { - reg = &func->regs[i]; - if (reg->type != SCALAR_VALUE) { - bt_clear_frame_reg(bt, fr, i); - continue; - } - if (reg->precise) { - bt_clear_frame_reg(bt, fr, i); - } else { - reg->precise = true; - *changed = true; - } - } - - bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); - for_each_set_bit(i, mask, 64) { - if (verifier_bug_if(i >= func->allocated_stack / BPF_REG_SIZE, - env, "stack slot %d, total slots %d", - i, func->allocated_stack / BPF_REG_SIZE)) - return -EFAULT; - - if (!is_spilled_scalar_reg(&func->stack[i])) { - bt_clear_frame_slot(bt, fr, i); - continue; - } - reg = &func->stack[i].spilled_ptr; - if (reg->precise) { - bt_clear_frame_slot(bt, fr, i); - } else { - reg->precise = true; - *changed = true; - } - } - if (env->log.level & BPF_LOG_LEVEL2) { - fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, - bt_frame_reg_mask(bt, fr)); - verbose(env, "mark_precise: frame%d: parent state regs=%s ", - fr, env->tmp_str_buf); - bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, - bt_frame_stack_mask(bt, fr)); - verbose(env, "stack=%s: ", env->tmp_str_buf); - print_verifier_state(env, st, fr, true); - } - } - - if (bt_empty(bt)) - return 0; - - subseq_idx = first_idx; - last_idx = st->last_insn_idx; - first_idx = st->first_insn_idx; - } - - /* if we still have requested precise regs or slots, we missed - * something (e.g., stack access through non-r10 register), so - * fallback to marking all precise - */ - if (!bt_empty(bt)) { - mark_all_scalars_precise(env, starting_state); - bt_reset(bt); - } - - return 0; -} - int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, env->cur_state, regno, NULL); + return bpf_mark_chain_precision(env, 
env->cur_state, regno, NULL); } /* mark_chain_precision_batch() assumes that env->bt is set in the caller to @@ -5026,7 +3654,7 @@ int mark_chain_precision(struct bpf_verifier_env *env, int regno) static int mark_chain_precision_batch(struct bpf_verifier_env *env, struct bpf_verifier_state *starting_state) { - return __mark_chain_precision(env, starting_state, -1, NULL); + return bpf_mark_chain_precision(env, starting_state, -1, NULL); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -5056,11 +3684,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type) } } -/* Does this register contain a constant zero? */ -static bool register_is_null(struct bpf_reg_state *reg) -{ - return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); -} /* check if register is a constant scalar value */ static bool is_reg_const(struct bpf_reg_state *reg, bool subreg32) @@ -5084,27 +3707,30 @@ static bool __is_pointer_value(bool allow_ptr_leaks, return reg->type != SCALAR_VALUE; } +static void clear_scalar_id(struct bpf_reg_state *reg) +{ + reg->id = 0; + reg->delta = 0; +} + static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, struct bpf_reg_state *src_reg) { if (src_reg->type != SCALAR_VALUE) return; - - if (src_reg->id & BPF_ADD_CONST) { - /* - * The verifier is processing rX = rY insn and - * rY->id has special linked register already. - * Cleared it, since multiple rX += const are not supported. - */ - src_reg->id = 0; - src_reg->off = 0; - } - + /* + * The verifier is processing rX = rY insn and + * rY->id has special linked register already. + * Cleared it, since multiple rX += const are not supported. + */ + if (src_reg->id & BPF_ADD_CONST) + clear_scalar_id(src_reg); + /* + * Ensure that src_reg has a valid ID that will be copied to + * dst_reg and then will be used by sync_linked_regs() to + * propagate min/max range. 
+ */ if (!src_reg->id && !tnum_is_const(src_reg->var_off)) - /* Ensure that src_reg has a valid ID that will be copied to - * dst_reg and then will be used by sync_linked_regs() to - * propagate min/max range. - */ src_reg->id = ++env->id_gen; } @@ -5166,6 +3792,18 @@ static void check_fastcall_stack_contract(struct bpf_verifier_env *env, } } +static void scrub_special_slot(struct bpf_func_state *state, int spi) +{ + int i; + + /* regular write of data into stack destroys any spilled ptr */ + state->stack[spi].spilled_ptr.type = NOT_INIT; + /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */ + if (is_stack_slot_special(&state->stack[spi])) + for (i = 0; i < BPF_REG_SIZE; i++) + scrub_spilled_slot(&state->stack[spi].slot_type[i]); +} + /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -5185,8 +3823,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, * so it's aligned access and [off, off + size) are within stack limits */ if (!env->allow_ptr_leaks && - is_spilled_reg(&state->stack[spi]) && - !is_spilled_scalar_reg(&state->stack[spi]) && + bpf_is_spilled_reg(&state->stack[spi]) && + !bpf_is_spilled_scalar_reg(&state->stack[spi]) && size != BPF_REG_SIZE) { verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; @@ -5215,18 +3853,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, if (err) return err; - if (!(off % BPF_REG_SIZE) && size == BPF_REG_SIZE) { - /* only mark the slot as written if all 8 bytes were written - * otherwise read propagation may incorrectly stop too soon - * when stack slots are partially written. 
- * This heuristic means that read propagation will be - * conservative, since it will add reg_live_read marks - * to stack slots all the way to first state when programs - * writes+reads less than 8 bytes - */ - bpf_mark_stack_write(env, state->frameno, BIT(spi)); - } - check_fastcall_stack_contract(env, state, insn_idx, off); mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && reg->type == SCALAR_VALUE && env->bpf_capable) { @@ -5263,15 +3889,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } else { u8 type = STACK_MISC; - /* regular write of data into stack destroys any spilled ptr */ - state->stack[spi].spilled_ptr.type = NOT_INIT; - /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */ - if (is_stack_slot_special(&state->stack[spi])) - for (i = 0; i < BPF_REG_SIZE; i++) - scrub_spilled_slot(&state->stack[spi].slot_type[i]); + scrub_special_slot(state, spi); /* when we zero initialize stack slots mark them as such */ - if ((reg && register_is_null(reg)) || + if ((reg && bpf_register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { /* STACK_ZERO case happened because register spill * wasn't properly aligned at the stack slot boundary, @@ -5292,7 +3913,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5302,7 +3923,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, * tracks the effects of the write, considering that each stack slot in the * dynamic range is potentially written to. * - * 'off' includes 'regno->off'. * 'value_regno' can be -1, meaning that an unknown value is being written to * the stack. 
* @@ -5338,14 +3958,14 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, max_off = ptr_reg->smax_value + off + size; if (value_regno >= 0) value_reg = &cur->regs[value_regno]; - if ((value_reg && register_is_null(value_reg)) || + if ((value_reg && bpf_register_is_null(value_reg)) || (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0)) writing_zero = true; for (i = min_off; i < max_off; i++) { int spi; - spi = __get_spi(i); + spi = bpf_get_spi(i); err = destroy_if_dynptr_stack_slot(env, state, spi); if (err) return err; @@ -5383,7 +4003,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, * maintain the spill type. */ if (writing_zero && *stype == STACK_SPILL && - is_spilled_scalar_reg(&state->stack[spi])) { + bpf_is_spilled_scalar_reg(&state->stack[spi])) { struct bpf_reg_state *spill_reg = &state->stack[spi].spilled_ptr; if (tnum_is_const(spill_reg->var_off) && spill_reg->var_off.value == 0) { @@ -5392,8 +4012,13 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, } } - /* Erase all other spilled pointers. */ - state->stack[spi].spilled_ptr.type = NOT_INIT; + /* + * Scrub slots if variable-offset stack write goes over spilled pointers. + * Otherwise bpf_is_spilled_reg() may == true && spilled_ptr.type == NOT_INIT + * and valid program is rejected by check_stack_read_fixed_off() + * with obscure "invalid size of register fill" message. + */ + scrub_special_slot(state, spi); /* Update the slot type. */ new_type = STACK_MISC; @@ -5408,8 +4033,10 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, * For privileged programs, we will accept such reads to slots * that may or may not be written because, if we're reject * them, the error would be too confusing. + * Conservatively, treat STACK_POISON in a similar way. 
*/ - if (*stype == STACK_INVALID && !env->allow_uninit_stack) { + if ((*stype == STACK_INVALID || *stype == STACK_POISON) && + !env->allow_uninit_stack) { verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d", insn_idx, i); return -EINVAL; @@ -5484,18 +4111,14 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg; u8 *stype, type; int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); - int err; stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; mark_stack_slot_scratched(env, spi); check_fastcall_stack_contract(env, state, env->insn_idx, off); - err = bpf_mark_stack_read(env, reg_state->frameno, env->insn_idx, BIT(spi)); - if (err) - return err; - if (is_spilled_reg(®_state->stack[spi])) { + if (bpf_is_spilled_reg(®_state->stack[spi])) { u8 spill_size = 1; for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--) @@ -5531,7 +4154,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * coerce_reg_to_size will adjust the boundaries. 
*/ if (get_reg_width(reg) > size * BITS_PER_BYTE) - state->regs[dst_regno].id = 0; + clear_scalar_id(&state->regs[dst_regno]); } else { int spill_cnt = 0, zero_cnt = 0; @@ -5549,8 +4172,13 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, } if (type == STACK_INVALID && env->allow_uninit_stack) continue; - verbose(env, "invalid read from stack off %d+%d size %d\n", - off, i, size); + if (type == STACK_POISON) { + verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n", + off, i, size); + } else { + verbose(env, "invalid read from stack off %d+%d size %d\n", + off, i, size); + } return -EACCES; } @@ -5599,8 +4227,13 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, continue; if (type == STACK_INVALID && env->allow_uninit_stack) continue; - verbose(env, "invalid read from stack off %d+%d size %d\n", - off, i, size); + if (type == STACK_POISON) { + verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n", + off, i, size); + } else { + verbose(env, "invalid read from stack off %d+%d size %d\n", + off, i, size); + } return -EACCES; } if (dst_regno >= 0) @@ -5608,7 +4241,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5646,7 +4279,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, { /* The state of the source register. 
*/ struct bpf_reg_state *reg = reg_state(env, ptr_regno); - struct bpf_func_state *ptr_state = func(env, reg); + struct bpf_func_state *ptr_state = bpf_func(env, reg); int err; int min_off, max_off; @@ -5678,7 +4311,7 @@ static int check_stack_read(struct bpf_verifier_env *env, int dst_regno) { struct bpf_reg_state *reg = reg_state(env, ptr_regno); - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int err; /* Some accesses are only permitted with a static offset. */ bool var_off = !tnum_is_const(reg->var_off); @@ -5724,7 +4357,6 @@ static int check_stack_read(struct bpf_verifier_env *env, * check_stack_write_var_off. * * 'ptr_regno' is the register used as a pointer into the stack. - * 'off' includes 'ptr_regno->off', but not its variable offset (if any). * 'value_regno' is the register whose value we're writing to the stack. It can * be -1, meaning that we're not writing from a register. * @@ -5735,7 +4367,7 @@ static int check_stack_write(struct bpf_verifier_env *env, int value_regno, int insn_idx) { struct bpf_reg_state *reg = reg_state(env, ptr_regno); - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int err; if (tnum_is_const(reg->var_off)) { @@ -5761,14 +4393,14 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { - verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", - map->value_size, off, size); + verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n", + map->value_size, reg->smin_value + off, size); return -EACCES; } if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { - verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", - map->value_size, off, size); + verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n", + map->value_size, reg->smin_value + off, size); 
return -EACCES; } @@ -5802,6 +4434,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, off, mem_size); break; + case PTR_TO_CTX: + verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MEM: default: verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n", @@ -5875,24 +4511,24 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, * is only allowed in its original, unmodified form. */ - if (reg->off < 0) { - verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } - if (!fixed_off_ok && reg->off) { - verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); + if (reg->smin_value < 0) { + verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n", + reg_type_str(env, reg->type), regno, reg->var_off.value); return -EACCES; } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable %s access var_off=%s disallowed\n", - reg_type_str(env, reg->type), tn_buf); + if (!fixed_off_ok && reg->var_off.value != 0) { + verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n", + reg_type_str(env, reg->type), regno, reg->var_off.value); return -EACCES; } @@ -5913,6 +4549,9 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, int perm_flags; const char *reg_name = ""; + if (base_type(reg->type) != PTR_TO_BTF_ID) + goto bad_type; + if (btf_is_kernel(reg->btf)) { perm_flags = PTR_MAYBE_NULL | 
PTR_TRUSTED | MEM_RCU; @@ -5925,7 +4564,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, perm_flags |= MEM_PERCPU; } - if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) + if (type_flag(reg->type) & ~perm_flags) goto bad_type; /* We need to verify reg->type and reg->btf, before accessing reg->btf */ @@ -5934,14 +4573,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, /* For ref_ptr case, release function check should ensure we get one * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. - * Since ref_ptr cannot be accessed directly by BPF insns, checks for - * reg->off and reg->ref_obj_id are not needed here. + * Since ref_ptr cannot be accessed directly by BPF insns, check for + * reg->ref_obj_id is not needed here. */ if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES; /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and - * we also need to take into account the reg->off. + * we also need to take into account the reg->var_off. * * We want to support cases like: * @@ -5952,19 +4591,19 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * * struct foo *v; * v = func(); // PTR_TO_BTF_ID - * val->foo = v; // reg->off is zero, btf and btf_id match type - * val->bar = &v->br; // reg->off is still zero, but we need to retry with + * val->foo = v; // reg->var_off is zero, btf and btf_id match type + * val->bar = &v->br; // reg->var_off is still zero, but we need to retry with * // first member type of struct after comparison fails - * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked + * val->baz = &v->bz; // reg->var_off is non-zero, so struct needs to be walked * // to match type * - * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off + * In the kptr_ref case, check_func_arg_reg_off already ensures reg->var_off * is zero. 
We must also ensure that btf_struct_ids_match does not walk * the struct to match type against first member of struct, i.e. reject * second case from above. Hence, when type is BPF_KPTR_REF, we set * strict mode to true for type match. */ - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, kptr_field->kptr.btf, kptr_field->kptr.btf_id, kptr_field->type != BPF_KPTR_UNREF)) goto bad_type; @@ -6128,7 +4767,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, return ret; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); - if (!register_is_null(val_reg) && + if (!bpf_register_is_null(val_reg) && map_kptr_match_type(env, kptr_field, val_reg, value_regno)) return -EACCES; } else if (class == BPF_ST) { @@ -6223,11 +4862,9 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return 0; } -#define MAX_PACKET_OFF 0xffff - static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, - const struct bpf_call_arg_meta *meta, - enum bpf_access_type t) + const struct bpf_call_arg_meta *meta, + enum bpf_access_type t) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); @@ -6273,27 +4910,14 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_reg_state *reg = reg_state(env, regno); int err; - /* We may have added a variable offset to the packet pointer; but any - * reg->range we have comes after that. We are only checking the fixed - * offset. - */ - - /* We don't allow negative numbers, because we aren't tracking enough - * detail to prove they're safe. - */ - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; + if (reg->range < 0) { + verbose(env, "R%d offset is outside of the packet\n", regno); + return -EINVAL; } - err = reg->range < 0 ? 
-EINVAL : - __check_mem_access(env, regno, off, size, reg->range, - zero_size_allowed); - if (err) { - verbose(env, "R%d offset is outside of the packet\n", regno); + err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed); + if (err) return err; - } /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, @@ -6305,12 +4929,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, max_t(u32, env->prog->aux->max_pkt_offset, off + reg->umax_value + size - 1); - return err; + return 0; +} + +static bool is_var_ctx_off_allowed(struct bpf_prog *prog) +{ + return resolve_prog_type(prog) == BPF_PROG_TYPE_SYSCALL; } /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ -static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, - enum bpf_access_type t, struct bpf_insn_access_aux *info) +static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, + enum bpf_access_type t, struct bpf_insn_access_aux *info) { if (env->ops->is_valid_access && env->ops->is_valid_access(off, size, t, env->prog, info)) { @@ -6341,6 +4970,34 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, + int off, int access_size, enum bpf_access_type t, + struct bpf_insn_access_aux *info) +{ + /* + * Program types that don't rewrite ctx accesses can safely + * dereference ctx pointers with fixed offsets. 
+ */ + bool var_off_ok = is_var_ctx_off_allowed(env->prog); + bool fixed_off_ok = !env->ops->convert_ctx_access; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; + int err; + + if (var_off_ok) + err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false); + else + err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok); + if (err) + return err; + off += reg->umax_value; + + err = __check_ctx_access(env, insn_idx, off, access_size, t, info); + if (err) + verbose_linfo(env, insn_idx, "; "); + return err; +} + static int check_flow_keys_access(struct bpf_verifier_env *env, int off, int size) { @@ -6522,14 +5179,14 @@ static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, */ ip_align = 2; - reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off)); + reg_off = tnum_add(reg->var_off, tnum_const(ip_align + off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "misaligned packet access off %d+%s+%d+%d size %d\n", - ip_align, tn_buf, reg->off, off, size); + "misaligned packet access off %d+%s+%d size %d\n", + ip_align, tn_buf, off, size); return -EACCES; } @@ -6547,13 +5204,13 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env, if (!strict || size == 1) return 0; - reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off)); + reg_off = tnum_add(reg->var_off, tnum_const(off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", - pointer_desc, tn_buf, reg->off, off, size); + verbose(env, "misaligned %saccess off %s+%d size %d\n", + pointer_desc, tn_buf, off, size); return -EACCES; } @@ -6656,22 +5313,30 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth) return round_up(max_t(u32, stack_depth, 1), 32); } +/* temporary state used for call frame depth 
calculation */ +struct bpf_subprog_call_depth_info { + int ret_insn; /* caller instruction where we return to. */ + int caller; /* caller subprogram idx */ + int frame; /* # of consecutive static call stack frames on top of stack */ +}; + /* starting from main bpf function walk all instructions of the function * and recursively walk all callees that given function can call. * Ignore jump and exit insns. - * Since recursion is prevented by check_cfg() this algorithm - * only needs a local stack of MAX_CALL_FRAMES to remember callsites */ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx, + struct bpf_subprog_call_depth_info *dinfo, bool priv_stack_supported) { struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int depth = 0, frame = 0, i, subprog_end, subprog_depth; bool tail_call_reachable = false; - int ret_insn[MAX_CALL_FRAMES]; - int ret_prog[MAX_CALL_FRAMES]; - int j; + int total; + int tmp; + + /* no caller idx */ + dinfo[idx].caller = -1; i = subprog[idx].start; if (!priv_stack_supported) @@ -6723,8 +5388,12 @@ process_func: } else { depth += subprog_depth; if (depth > MAX_BPF_STACK) { + total = 0; + for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) + total++; + verbose(env, "combined stack size of %d calls is %d. 
Too large\n", - frame + 1, depth); + total, depth); return -EACCES; } } @@ -6738,10 +5407,8 @@ continue_func: if (!is_bpf_throw_kfunc(insn + i)) continue; - if (subprog[idx].is_cb) - err = true; - for (int c = 0; c < frame && !err; c++) { - if (subprog[ret_prog[c]].is_cb) { + for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) { + if (subprog[tmp].is_cb) { err = true; break; } @@ -6757,12 +5424,10 @@ continue_func: if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ - ret_insn[frame] = i + 1; - ret_prog[frame] = idx; /* find the callee */ next_insn = i + insn[i].imm + 1; - sidx = find_subprog(env, next_insn); + sidx = bpf_find_subprog(env, next_insn); if (verifier_bug_if(sidx < 0, env, "callee not found at insn %d", next_insn)) return -EFAULT; if (subprog[sidx].is_async_cb) { @@ -6778,7 +5443,16 @@ continue_func: return -EINVAL; } } + + /* store caller info for after we return from callee */ + dinfo[idx].frame = frame; + dinfo[idx].ret_insn = i + 1; + + /* push caller idx into callee's dinfo */ + dinfo[sidx].caller = idx; + i = next_insn; + idx = sidx; if (!priv_stack_supported) subprog[idx].priv_stack_mode = NO_PRIV_STACK; @@ -6786,7 +5460,7 @@ continue_func: if (subprog[idx].has_tail_call) tail_call_reachable = true; - frame++; + frame = bpf_subprog_is_global(env, idx) ? 
0 : frame + 1; if (frame >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep !\n", frame); @@ -6800,12 +5474,12 @@ continue_func: * tail call counter throughout bpf2bpf calls combined with tailcalls */ if (tail_call_reachable) - for (j = 0; j < frame; j++) { - if (subprog[ret_prog[j]].is_exception_cb) { + for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) { + if (subprog[tmp].is_exception_cb) { verbose(env, "cannot tail call within exception cb\n"); return -EINVAL; } - subprog[ret_prog[j]].tail_call_reachable = true; + subprog[tmp].tail_call_reachable = true; } if (subprog[0].tail_call_reachable) env->prog->aux->tail_call_reachable = true; @@ -6813,23 +5487,33 @@ continue_func: /* end of for() loop means the last insn of the 'subprog' * was reached. Doesn't matter whether it was JA or EXIT */ - if (frame == 0) + if (frame == 0 && dinfo[idx].caller < 0) return 0; if (subprog[idx].priv_stack_mode != PRIV_STACK_ADAPTIVE) depth -= round_up_stack_depth(env, subprog[idx].stack_depth); - frame--; - i = ret_insn[frame]; - idx = ret_prog[frame]; + + /* pop caller idx from callee */ + idx = dinfo[idx].caller; + + /* retrieve caller state from its frame */ + frame = dinfo[idx].frame; + i = dinfo[idx].ret_insn; + goto continue_func; } static int check_max_stack_depth(struct bpf_verifier_env *env) { enum priv_stack_mode priv_stack_mode = PRIV_STACK_UNKNOWN; + struct bpf_subprog_call_depth_info *dinfo; struct bpf_subprog_info *si = env->subprog_info; bool priv_stack_supported; int ret; + dinfo = kvcalloc(env->subprog_cnt, sizeof(*dinfo), GFP_KERNEL_ACCOUNT); + if (!dinfo) + return -ENOMEM; + for (int i = 0; i < env->subprog_cnt; i++) { if (si[i].has_tail_call) { priv_stack_mode = NO_PRIV_STACK; @@ -6851,9 +5535,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) for (int i = env->subprog_cnt - 1; i >= 0; i--) { if (!i || si[i].is_async_cb) { priv_stack_supported = !i && priv_stack_mode == PRIV_STACK_ADAPTIVE; - ret = 
check_max_stack_depth_subprog(env, i, priv_stack_supported); - if (ret < 0) + ret = check_max_stack_depth_subprog(env, i, dinfo, + priv_stack_supported); + if (ret < 0) { + kvfree(dinfo); return ret; + } } } @@ -6864,21 +5551,10 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) } } - return 0; -} + kvfree(dinfo); -#ifndef CONFIG_BPF_JIT_ALWAYS_ON -static int get_callee_stack_depth(struct bpf_verifier_env *env, - const struct bpf_insn *insn, int idx) -{ - int start = idx + insn->imm + 1, subprog; - - subprog = find_subprog(env, start); - if (verifier_bug_if(subprog < 0, env, "get stack depth: no program at insn %d", start)) - return -EFAULT; - return env->subprog_info[subprog].stack_depth; + return 0; } -#endif static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, @@ -6891,7 +5567,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, regno, buf_info, off, size); return -EACCES; } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -6914,8 +5590,8 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, if (err) return err; - if (off + size > env->prog->aux->max_tp_access) - env->prog->aux->max_tp_access = off + size; + env->prog->aux->max_tp_access = max(reg->var_off.value + off + size, + env->prog->aux->max_tp_access); return 0; } @@ -6933,8 +5609,7 @@ static int check_buffer_access(struct bpf_verifier_env *env, if (err) return err; - if (off + size > *max_access) - *max_access = off + size; + *max_access = max(reg->var_off.value + off + size, *max_access); return 0; } @@ -7121,7 +5796,7 @@ out: set_sext32_default_val(reg, size); } -static bool bpf_map_is_rdonly(const struct bpf_map *map) +bool bpf_map_is_rdonly(const struct bpf_map *map) { /* A map is considered read-only if the following condition are true: * @@ -7141,8 +5816,8 @@ static bool bpf_map_is_rdonly(const struct bpf_map 
*map) !bpf_map_write_active(map); } -static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, - bool is_ldsx) +int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val, + bool is_ldsx) { void *ptr; u64 addr; @@ -7327,13 +6002,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, tname); return -EINVAL; } - if (off < 0) { - verbose(env, - "R%d is ptr_%s invalid negative access: off=%d\n", - regno, tname, off); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -7343,6 +6013,15 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + off += reg->var_off.value; + + if (off < 0) { + verbose(env, + "R%d is ptr_%s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (reg->type & MEM_USER) { verbose(env, "R%d is ptr_%s access user memory: off=%d\n", @@ -7554,7 +6233,7 @@ static int check_stack_access_within_bounds( enum bpf_access_type type) { struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); s64 min_off, max_off; int err; char *err_extra; @@ -7589,8 +6268,8 @@ static int check_stack_access_within_bounds( if (err) { if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid%s stack R%d off=%d size=%d\n", - err_extra, regno, off, access_size); + verbose(env, "invalid%s stack R%d off=%lld size=%d\n", + err_extra, regno, min_off, access_size); } else { char tn_buf[48]; @@ -7618,6 +6297,23 @@ static bool get_func_retval_range(struct bpf_prog *prog, return false; } +static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val) +{ + struct bpf_reg_state fake_reg; + + if (!val) + return; + + fake_reg.type = SCALAR_VALUE; + __mark_reg_known(&fake_reg, val); + + scalar32_min_max_add(dst_reg, &fake_reg); + 
scalar_min_max_add(dst_reg, &fake_reg); + dst_reg->var_off = tnum_add(dst_reg->var_off, fake_reg.var_off); + + reg_bounds_sync(dst_reg); +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -7636,14 +6332,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (size < 0) return size; - /* alignment checks will add in reg->off themselves */ err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err; - /* for access checks, reg->off is just part of off */ - off += reg->off; - if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) { verbose(env, "write to change key R%d not allowed\n", regno); @@ -7703,6 +6395,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } copy_register_state(®s[value_regno], reg); + add_scalar_to_reg(®s[value_regno], off); regs[value_regno].type = PTR_TO_INSN; } else { mark_reg_unknown(env, regs, value_regno); @@ -7740,12 +6433,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { - struct bpf_retval_range range; struct bpf_insn_access_aux info = { .reg_type = SCALAR_VALUE, .is_ldsx = is_ldsx, .log = &env->log, }; + struct bpf_retval_range range; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -7753,13 +6446,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ptr_off_reg(env, reg, regno); - if (err < 0) - return err; - - err = check_ctx_access(env, insn_idx, off, size, t, &info); - if (err) - verbose_linfo(env, insn_idx, "; "); + err = check_ctx_access(env, insn_idx, 
regno, off, size, t, &info); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter @@ -7851,7 +6538,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (base_type(reg->type) == PTR_TO_BUF) { + } else if (base_type(reg->type) == PTR_TO_BUF && + !type_may_be_null(reg->type)) { bool rdonly_mem = type_is_rdonly_mem(reg->type); u32 *max_access; @@ -8122,8 +6810,6 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) * on the access type and privileges, that all elements of the stack are * initialized. * - * 'off' includes 'regno->off', but not its dynamic part (if any). - * * All registers that have been spilled on the stack in the slots within the * read offsets are marked as read. */ @@ -8133,21 +6819,27 @@ static int check_stack_range_initialized( enum bpf_access_type type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int err, min_off, max_off, i, j, slot, spi; /* Some accesses can write anything into the stack, others are * read-only. */ - bool clobber = false; + bool clobber = type == BPF_WRITE; + /* + * Negative access_size signals global subprog/kfunc arg check where + * STACK_POISON slots are acceptable. static stack liveness + * might have determined that subprog doesn't read them, + * but BTF based global subprog validation isn't accurate enough. 
+ */ + bool allow_poison = access_size < 0 || clobber; + + access_size = abs(access_size); if (access_size == 0 && !zero_size_allowed) { verbose(env, "invalid zero-sized read\n"); return -EACCES; } - if (type == BPF_WRITE) - clobber = true; - err = check_stack_access_within_bounds(env, regno, off, access_size, type); if (err) return err; @@ -8199,7 +6891,7 @@ static int check_stack_range_initialized( for (i = min_off; i < max_off + access_size; i++) { int stack_off = -i - 1; - spi = __get_spi(i); + spi = bpf_get_spi(i); /* raw_mode may write past allocated_stack */ if (state->allocated_stack <= stack_off) continue; @@ -8235,7 +6927,7 @@ static int check_stack_range_initialized( goto mark; } - if (is_spilled_reg(&state->stack[spi]) && + if (bpf_is_spilled_reg(&state->stack[spi]) && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { if (clobber) { @@ -8246,7 +6938,12 @@ static int check_stack_range_initialized( goto mark; } - if (tnum_is_const(reg->var_off)) { + if (*stype == STACK_POISON) { + if (allow_poison) + goto mark; + verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n", + regno, min_off, i - min_off, access_size); + } else if (tnum_is_const(reg->var_off)) { verbose(env, "invalid read from stack R%d off %d+%d size %d\n", regno, min_off, i - min_off, access_size); } else { @@ -8258,17 +6955,7 @@ static int check_stack_range_initialized( } return -EACCES; mark: - /* reading any byte out of 8-byte 'spill_slot' will cause - * the whole slot to be marked as 'read' - */ - err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi)); - if (err) - return err; - /* We do not call bpf_mark_stack_write(), as we can not - * be sure that whether stack slot is written to or not. Hence, - * we must still conservatively propagate reads upwards even if - * helper may write to the entire memory range. 
- */ + ; } return 0; } @@ -8284,7 +6971,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, reg->off, access_size, + return check_packet_access(env, regno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { @@ -8292,12 +6979,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, regno, reg->off, access_size, + return check_mem_region_access(env, regno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, access_type)) + if (check_map_access_type(env, regno, 0, access_size, access_type)) return -EACCES; - return check_map_access(env, regno, reg->off, access_size, + return check_map_access(env, regno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { @@ -8307,7 +6994,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; } } - return check_mem_region_access(env, regno, reg->off, + return check_mem_region_access(env, regno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: @@ -8322,39 +7009,33 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } else { max_access = &env->prog->aux->max_rdwr_access; } - return check_buffer_access(env, reg, regno, reg->off, + return check_buffer_access(env, reg, regno, 0, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, - regno, reg->off, access_size, + regno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, regno, reg->off, + return check_ptr_to_btf_access(env, regs, regno, 0, access_size, 
BPF_READ, -1); case PTR_TO_CTX: - /* in case the function doesn't know how to access the context, - * (because we are in a program of type SYSCALL for example), we - * can not statically check its size. - * Dynamically check it now. - */ - if (!env->ops->convert_ctx_access) { - int offset = access_size - 1; - - /* Allow zero-byte read from PTR_TO_CTX */ - if (access_size == 0) - return zero_size_allowed ? 0 : -EACCES; - - return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - access_type, -1, false, false); + /* Only permit reading or writing syscall context using helper calls. */ + if (is_var_ctx_off_allowed(env->prog)) { + int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX, + zero_size_allowed); + if (err) + return err; + if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size) + env->prog->aux->max_ctx_offset = reg->umax_value + access_size; + return 0; } - fallthrough; default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - register_is_null(reg)) + bpf_register_is_null(reg)) return 0; verbose(env, "R%d type=%s ", regno, @@ -8427,7 +7108,7 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg struct bpf_reg_state saved_reg; int err; - if (register_is_null(reg)) + if (bpf_register_is_null(reg)) return 0; /* Assuming that the register contains a value check if the memory @@ -8439,8 +7120,10 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg mark_ptr_not_null_reg(reg); } - err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL); - err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL); + int size = base_type(reg->type) == PTR_TO_STACK ? 
-(int)mem_size : mem_size; + + err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -8543,9 +7226,9 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) return -EINVAL; } spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off; - if (spin_lock_off != val + reg->off) { + if (spin_lock_off != val) { verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", - val + reg->off, lock_str, spin_lock_off); + val, lock_str, spin_lock_off); return -EINVAL; } if (is_lock) { @@ -8660,9 +7343,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; } - if (field_off != val + reg->off) { + if (field_off != val) { verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", - val + reg->off, struct_name, field_off); + val, struct_name, field_off); return -EINVAL; } if (map_desc->ptr) { @@ -8730,7 +7413,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } - kptr_off = reg->off + reg->var_off.value; + kptr_off = reg->var_off.value; kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR); if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); @@ -8851,7 +7534,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); return state->stack[spi].spilled_ptr.ref_obj_id; } @@ -8866,10 +7549,6 @@ static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_ITER_NEW; } -static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta) -{ - return 
meta->kfunc_flags & KF_ITER_NEXT; -} static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta) { @@ -8987,7 +7666,7 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, struct list_head *pos, *head; /* Explored states are pushed in stack order, most recent states come first */ - head = explored_state(env, insn_idx); + head = bpf_explored_state(env, insn_idx); list_for_each(pos, head) { sl = container_of(pos, struct bpf_verifier_state_list, node); /* If st->branches != 0 state is a part of current DFS verification path, @@ -9002,11 +7681,6 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, return NULL; } -static void reset_idmap_scratch(struct bpf_verifier_env *env); -static bool regs_exact(const struct bpf_reg_state *rold, - const struct bpf_reg_state *rcur, - struct bpf_idmap *idmap); - /* * Check if scalar registers are exact for the purpose of not widening. * More lenient than regs_exact() @@ -9048,8 +7722,8 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, num_slots = min(fold->allocated_stack / BPF_REG_SIZE, fcur->allocated_stack / BPF_REG_SIZE); for (i = 0; i < num_slots; i++) { - if (!is_spilled_reg(&fold->stack[i]) || - !is_spilled_reg(&fcur->stack[i])) + if (!bpf_is_spilled_reg(&fold->stack[i]) || + !bpf_is_spilled_reg(&fcur->stack[i])) continue; maybe_widen_reg(env, @@ -9293,6 +7967,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MEM | MEM_RINGBUF, PTR_TO_BUF, PTR_TO_BTF_ID | PTR_TRUSTED, + PTR_TO_CTX, }, }; @@ -9329,7 +8004,9 @@ static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } static const struct bpf_reg_types kptr_xchg_dest_types = { .types = { PTR_TO_MAP_VALUE, - PTR_TO_BTF_ID | MEM_ALLOC + PTR_TO_BTF_ID | MEM_ALLOC, + PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF, + PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU, } }; static const struct bpf_reg_types dynptr_types = { @@ -9373,7 +8050,7 @@ static int 
check_reg_type(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; - int i, j; + int i, j, err; compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { @@ -9476,8 +8153,12 @@ found: return -EACCES; } - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id, + err = __check_ptr_off_reg(env, reg, regno, true); + if (err) + return err; + + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, + reg->var_off.value, btf_vmlinux, *arg_btf_id, strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, btf_type_name(reg->btf, reg->btf_id), @@ -9489,6 +8170,8 @@ found: } case PTR_TO_BTF_ID | MEM_ALLOC: case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: + case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: + case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF | MEM_RCU: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { verifier_bug(env, "unimplemented handling of MEM_ALLOC"); @@ -9555,12 +8238,11 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * because fixed_off_ok is false, but checking here allows us * to give the user a better error message. */ - if (reg->off) { + if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) { verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", regno); return -EINVAL; } - return __check_ptr_off_reg(env, reg, regno, false); } switch (type) { @@ -9595,6 +8277,16 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * still need to do checks instead of returning. 
*/ return __check_ptr_off_reg(env, reg, regno, true); + case PTR_TO_CTX: + /* + * Allow fixed and variable offsets for syscall context, but + * only when the argument is passed as memory, not ctx, + * otherwise we may get modified ctx in tail called programs and + * global subprogs (that may act as extension prog hooks). + */ + if (arg_type != ARG_PTR_TO_CTX && is_var_ctx_off_allowed(env->prog)) + return 0; + fallthrough; default: return __check_ptr_off_reg(env, reg, regno, false); } @@ -9624,7 +8316,7 @@ static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) @@ -9637,7 +8329,7 @@ static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) @@ -9651,13 +8343,13 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi; if (reg->type == CONST_PTR_TO_DYNPTR) return reg->dynptr.type; - spi = __get_spi(reg->off); + spi = bpf_get_spi(reg->var_off.value); if (spi < 0) { verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); return BPF_DYNPTR_TYPE_INVALID; @@ -9698,13 +8390,13 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false, + err = check_map_access(env, regno, 0, + map->value_size - 
reg->var_off.value, false, ACCESS_HELPER); if (err) return err; - map_off = reg->off + reg->var_off.value; + map_off = reg->var_off.value; err = map->ops->map_direct_value_addr(map, &map_addr, map_off); if (err) { verbose(env, "direct value access on string failed\n"); @@ -9725,7 +8417,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, u32 key_size, s64 *value) { - struct bpf_func_state *state = func(env, key); + struct bpf_func_state *state = bpf_func(env, key); struct bpf_reg_state *reg; int slot, spi, off; int spill_size = 0; @@ -9741,7 +8433,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, if (!tnum_is_const(key->var_off)) return -EOPNOTSUPP; - stack_off = key->off + key->var_off.value; + stack_off = key->var_off.value; slot = -stack_off - 1; spi = slot / BPF_REG_SIZE; off = slot % BPF_REG_SIZE; @@ -9756,7 +8448,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, } /* Check that stack contains a scalar spill of expected size */ - if (!is_spilled_scalar_reg(&state->stack[spi])) + if (!bpf_is_spilled_scalar_reg(&state->stack[spi])) return -EOPNOTSUPP; for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--) spill_size++; @@ -9771,7 +8463,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, /* We are relying on a constant value. So mark as precise * to prevent pruning on it. */ - bt_set_frame_slot(&env->bt, key->frameno, spi); + bpf_bt_set_frame_slot(&env->bt, key->frameno, spi); err = mark_chain_precision_batch(env, env->cur_state); if (err < 0) return err; @@ -9823,7 +8515,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; } - if (register_is_null(reg) && type_may_be_null(arg_type)) + if (bpf_register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking. 
*/ @@ -9845,7 +8537,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, skip_type_check: if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { - struct bpf_func_state *state = func(env, reg); + struct bpf_func_state *state = bpf_func(env, reg); int spi; /* Only dynptr created on stack can be released, thus @@ -9863,7 +8555,7 @@ skip_type_check: verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } - } else if (!reg->ref_obj_id && !register_is_null(reg)) { + } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) { verbose(env, "R%d must be referenced when passed to release function\n", regno); return -EINVAL; @@ -9942,7 +8634,7 @@ skip_type_check: } break; case ARG_PTR_TO_MAP_VALUE: - if (type_may_be_null(arg_type) && register_is_null(reg)) + if (type_may_be_null(arg_type) && bpf_register_is_null(reg)) return 0; /* bpf_map_xxx(..., map_ptr, ..., value) call: @@ -10090,7 +8782,7 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id) return false; } -static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env) +bool bpf_allow_tail_call_in_subprogs(struct bpf_verifier_env *env) { return env->prog->jit_requested && bpf_jit_supports_subprog_tailcalls(); @@ -10235,7 +8927,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { + if (env->subprog_cnt > 1 && !bpf_allow_tail_call_in_subprogs(env)) { verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n"); return -EINVAL; } @@ -10547,7 +9239,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, /* after the call registers r0 - r5 were scratched */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, regs, caller_saved[i]); + bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); __check_reg_arg(env, regs, caller_saved[i], 
DST_OP_NO_MARK); } } @@ -10642,7 +9334,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * invalid memory access. */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX); if (ret < 0) return ret; /* If function expects ctx type in BTF check that caller @@ -10686,7 +9378,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_call_arg_meta meta; int err; - if (register_is_null(reg) && type_may_be_null(arg->arg_type)) + if (bpf_register_is_null(reg) && type_may_be_null(arg->arg_type)) continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ @@ -10768,7 +9460,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins return -EFAULT; } - if (is_async_callback_calling_insn(insn)) { + if (bpf_is_async_callback_calling_insn(insn)) { struct bpf_verifier_state *async_cb; /* there is no real recursion here. 
timer and workqueue callbacks are async */ @@ -10815,7 +9507,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int err, subprog, target_insn; target_insn = *insn_idx + insn->imm + 1; - subprog = find_subprog(env, target_insn); + subprog = bpf_find_subprog(env, target_insn); if (verifier_bug_if(subprog < 0, env, "target of func call at insn %d is not a program", target_insn)) return -EFAULT; @@ -10824,7 +9516,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, err = btf_check_subprog_call(env, subprog, caller->regs); if (err == -EFAULT) return err; - if (subprog_is_global(env, subprog)) { + if (bpf_subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); if (env->cur_state->active_locks) { @@ -10833,12 +9525,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } - if (env->subprog_info[subprog].might_sleep && - (env->cur_state->active_rcu_locks || env->cur_state->active_preempt_locks || - env->cur_state->active_irq_id || !in_sleepable(env))) { - verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" - "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" - "a non-sleepable BPF program context\n"); + if (env->subprog_info[subprog].might_sleep && !in_sleepable_context(env)) { + verbose(env, "sleepable global function %s() called in %s\n", + sub_name, non_sleepable_context_description(env)); return -EINVAL; } @@ -10857,9 +9546,11 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); - /* All global functions return a 64-bit SCALAR_VALUE */ - mark_reg_unknown(env, caller->regs, BPF_REG_0); - caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* All non-void global functions return a 64-bit SCALAR_VALUE. 
*/ + if (!subprog_returns_void(env, subprog)) { + mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + } /* continue with next insn after call */ return 0; @@ -10877,8 +9568,6 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = env->subprog_info[subprog].start - 1; - bpf_reset_live_stack_callchain(env); - if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, state, caller->frameno, true); @@ -10912,7 +9601,7 @@ int map_set_for_each_callback_args(struct bpf_verifier_env *env, callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3]; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); return 0; } @@ -10969,9 +9658,9 @@ static int set_loop_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); @@ -11001,8 +9690,8 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_3].map_ptr = map_ptr; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; callee->callback_ret_range = retval_range(0, 0); return 0; @@ -11029,8 +9718,8 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_3] = 
caller->regs[BPF_REG_4]; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); return 0; @@ -11045,14 +9734,14 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, * callback_ctx, u64 flags); * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_0]); mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); @@ -11073,7 +9762,8 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, */ struct btf_field *field; - field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off, + field = reg_find_field_offset(&caller->regs[BPF_REG_1], + caller->regs[BPF_REG_1].var_off.value, BPF_RB_ROOT); if (!field || !field->graph_root.value_btf_id) return -EFAULT; @@ -11083,9 +9773,9 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root); ref_set_non_owning(env, &callee->regs[BPF_REG_2]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, 
&callee->regs[BPF_REG_3]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_callback_fn = true; callee->callback_ret_range = retval_range(0, 1); return 0; @@ -11114,8 +9804,8 @@ static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_3].map_ptr = map_ptr; /* unused */ - __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); - __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + bpf_mark_reg_not_init(env, &callee->regs[BPF_REG_5]); callee->in_async_callback_fn = true; callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); return 0; @@ -11146,10 +9836,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) return is_rbtree_lock_required_kfunc(kfunc_btf_id); } -static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg, - bool return_32bit) +static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) { - if (return_32bit) + if (range.return_32bit) return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; else return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; @@ -11163,10 +9852,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) bool in_callback_fn; int err; - err = bpf_update_live_stack(env); - if (err) - return err; - callee = state->frame[state->curframe]; r0 = &callee->regs[BPF_REG_0]; if (r0->type == PTR_TO_STACK) { @@ -11193,7 +9878,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; /* enforce R0 return value range, and bpf_callback_t returns 64bit */ - if (!retval_range_within(callee->callback_ret_range, r0, false)) { + if (!retval_range_within(callee->callback_ret_range, r0)) { verbose_invalid_scalar(env, r0, callee->callback_ret_range, "At callback return", "R0"); return -EINVAL; @@ 
-11449,7 +10134,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const * and map_direct_value_addr is set. */ - fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + fmt_map_off = fmt_reg->var_off.value; err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, fmt_map_off); if (err) { @@ -11475,7 +10160,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env) if (type == BPF_PROG_TYPE_TRACING) { if (!bpf_prog_has_trampoline(env->prog)) { - verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n", + verbose(env, "func %s#%d supported only for fentry/fexit/fsession/fmod_ret programs\n", func_id_name(func_id), func_id); return -ENOTSUPP; } @@ -11497,7 +10182,7 @@ static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) static bool loop_flag_is_zero(struct bpf_verifier_env *env) { struct bpf_reg_state *reg = reg_state(env, BPF_REG_4); - bool reg_is_null = register_is_null(reg); + bool reg_is_null = bpf_register_is_null(reg); if (reg_is_null) mark_chain_precision(env, BPF_REG_4); @@ -11538,8 +10223,8 @@ static bool can_elide_value_nullness(enum bpf_map_type type) } } -static int get_helper_proto(struct bpf_verifier_env *env, int func_id, - const struct bpf_func_proto **ptr) +int bpf_get_helper_proto(struct bpf_verifier_env *env, int func_id, + const struct bpf_func_proto **ptr) { if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) return -ERANGE; @@ -11561,6 +10246,19 @@ static inline bool in_sleepable_context(struct bpf_verifier_env *env) in_sleepable(env); } +static const char *non_sleepable_context_description(struct bpf_verifier_env *env) +{ + if (env->cur_state->active_rcu_locks) + return "rcu_read_lock region"; + if (env->cur_state->active_preempt_locks) + return "non-preemptible region"; + if (env->cur_state->active_irq_id) + return "IRQ-disabled region"; + if (env->cur_state->active_locks) + return "lock region"; + return 
"non-sleepable prog"; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -11577,7 +10275,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* find function prototype */ func_id = insn->imm; - err = get_helper_proto(env, insn->imm, &fn); + err = bpf_get_helper_proto(env, insn->imm, &fn); if (err == -ERANGE) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; @@ -11600,11 +10298,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } - if (!in_sleepable(env) && fn->might_sleep) { - verbose(env, "helper call might sleep in a non-sleepable prog\n"); - return -EINVAL; - } - /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { @@ -11621,28 +10314,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (env->cur_state->active_rcu_locks) { - if (fn->might_sleep) { - verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", - func_id_name(func_id), func_id); - return -EINVAL; - } - } - - if (env->cur_state->active_preempt_locks) { - if (fn->might_sleep) { - verbose(env, "sleepable helper %s#%d in non-preemptible region\n", - func_id_name(func_id), func_id); - return -EINVAL; - } - } - - if (env->cur_state->active_irq_id) { - if (fn->might_sleep) { - verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n", - func_id_name(func_id), func_id); - return -EINVAL; - } + if (fn->might_sleep && !in_sleepable_context(env)) { + verbose(env, "sleepable helper %s#%d in %s\n", func_id_name(func_id), func_id, + non_sleepable_context_description(env)); + return -EINVAL; } /* Track non-sleepable context for helpers. 
*/ @@ -11703,7 +10378,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); - } else if (register_is_null(®s[meta.release_regno])) { + } else if (bpf_register_is_null(®s[meta.release_regno])) { /* meta.ref_obj_id can only be 0 if register that is meant to be * released is NULL, which must be > R0. */ @@ -11726,7 +10401,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* check that flags argument in get_local_storage(map, flags) is 0, * this is required because get_local_storage() can't return an error. */ - if (!register_is_null(®s[BPF_REG_2])) { + if (!bpf_register_is_null(®s[BPF_REG_2])) { verbose(env, "get_local_storage() doesn't support non-zero flags\n"); return -EINVAL; } @@ -11869,7 +10544,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, regs, caller_saved[i]); + bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -12131,10 +10806,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } -static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_SLEEPABLE; -} static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) { @@ -12355,6 +11026,28 @@ static bool is_kfunc_arg_prog_aux(const struct btf *btf, const struct btf_param return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_PROG_AUX_ID); } +/* + * A kfunc with KF_IMPLICIT_ARGS has two prototypes in BTF: + * - the _impl prototype with full arg list (meta->func_proto) + * - the BPF API prototype w/o implicit args (func->type in BTF) + * To determine whether an argument is implicit, we compare its position + * against the number of arguments in the prototype w/o implicit 
args. + */ +static bool is_kfunc_arg_implicit(const struct bpf_kfunc_call_arg_meta *meta, u32 arg_idx) +{ + const struct btf_type *func, *func_proto; + u32 argn; + + if (!(meta->kfunc_flags & KF_IMPLICIT_ARGS)) + return false; + + func = btf_type_by_id(meta->btf, meta->func_id); + func_proto = btf_type_by_id(meta->btf, func->type); + argn = btf_type_vlen(func_proto); + + return argn <= arg_idx; +} + /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, @@ -12421,10 +11114,15 @@ enum kfunc_ptr_arg_type { enum special_kfunc_type { KF_bpf_obj_new_impl, + KF_bpf_obj_new, KF_bpf_obj_drop_impl, + KF_bpf_obj_drop, KF_bpf_refcount_acquire_impl, + KF_bpf_refcount_acquire, KF_bpf_list_push_front_impl, + KF_bpf_list_push_front, KF_bpf_list_push_back_impl, + KF_bpf_list_push_back, KF_bpf_list_pop_front, KF_bpf_list_pop_back, KF_bpf_list_front, @@ -12435,6 +11133,7 @@ enum special_kfunc_type { KF_bpf_rcu_read_unlock, KF_bpf_rbtree_remove, KF_bpf_rbtree_add_impl, + KF_bpf_rbtree_add, KF_bpf_rbtree_first, KF_bpf_rbtree_root, KF_bpf_rbtree_left, @@ -12447,7 +11146,9 @@ enum special_kfunc_type { KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, KF_bpf_percpu_obj_new_impl, + KF_bpf_percpu_obj_new, KF_bpf_percpu_obj_drop_impl, + KF_bpf_percpu_obj_drop, KF_bpf_throw, KF_bpf_wq_set_callback, KF_bpf_preempt_disable, @@ -12481,10 +11182,15 @@ enum special_kfunc_type { BTF_ID_LIST(special_kfunc_list) BTF_ID(func, bpf_obj_new_impl) +BTF_ID(func, bpf_obj_new) BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_obj_drop) BTF_ID(func, bpf_refcount_acquire_impl) +BTF_ID(func, bpf_refcount_acquire) BTF_ID(func, bpf_list_push_front_impl) +BTF_ID(func, bpf_list_push_front) BTF_ID(func, bpf_list_push_back_impl) +BTF_ID(func, bpf_list_push_back) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_list_front) @@ -12495,6 +11201,7 @@ BTF_ID(func, 
bpf_rcu_read_lock) BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) +BTF_ID(func, bpf_rbtree_add) BTF_ID(func, bpf_rbtree_first) BTF_ID(func, bpf_rbtree_root) BTF_ID(func, bpf_rbtree_left) @@ -12514,7 +11221,9 @@ BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) BTF_ID(func, bpf_percpu_obj_new_impl) +BTF_ID(func, bpf_percpu_obj_new) BTF_ID(func, bpf_percpu_obj_drop_impl) +BTF_ID(func, bpf_percpu_obj_drop) BTF_ID(func, bpf_throw) BTF_ID(func, bpf_wq_set_callback) BTF_ID(func, bpf_preempt_disable) @@ -12558,6 +11267,50 @@ BTF_ID(func, bpf_session_is_return) BTF_ID(func, bpf_stream_vprintk) BTF_ID(func, bpf_stream_print_stack) +static bool is_bpf_obj_new_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_obj_new] || + func_id == special_kfunc_list[KF_bpf_obj_new_impl]; +} + +static bool is_bpf_percpu_obj_new_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_percpu_obj_new] || + func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]; +} + +static bool is_bpf_obj_drop_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_obj_drop] || + func_id == special_kfunc_list[KF_bpf_obj_drop_impl]; +} + +static bool is_bpf_percpu_obj_drop_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_percpu_obj_drop] || + func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]; +} + +static bool is_bpf_refcount_acquire_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_refcount_acquire] || + func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; +} + +static bool is_bpf_list_push_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_list_push_front] || + func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || + func_id == special_kfunc_list[KF_bpf_list_push_back] || + func_id == special_kfunc_list[KF_bpf_list_push_back_impl]; +} + +static bool is_bpf_rbtree_add_kfunc(u32 func_id) +{ 
+ return func_id == special_kfunc_list[KF_bpf_rbtree_add] || + func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; +} + static bool is_task_work_add_kfunc(u32 func_id) { return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || @@ -12566,10 +11319,8 @@ static bool is_task_work_add_kfunc(u32 func_id) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { - if (meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && - meta->arg_owning_ref) { + if (is_bpf_refcount_acquire_kfunc(meta->func_id) && meta->arg_owning_ref) return false; - } return meta->kfunc_flags & KF_RET_NULL; } @@ -12594,7 +11345,7 @@ static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta) return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable]; } -static bool is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) +bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) { return meta->func_id == special_kfunc_list[KF_bpf_xdp_pull_data]; } @@ -12629,7 +11380,7 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg) && + if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) && !arg_mem_size) return KF_ARG_PTR_TO_NULL; @@ -12755,13 +11506,12 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; - WARN_ON_ONCE(is_kfunc_release(meta) && - (reg->off || !tnum_is_const(reg->var_off) || - reg->var_off.value)); + WARN_ON_ONCE(is_kfunc_release(meta) && !tnum_is_const(reg->var_off)); reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); - struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, 
reg->off, meta->btf, ref_id, strict_type_match); + struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->var_off.value, + meta->btf, ref_id, strict_type_match); /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot * actually use it -- it must cast to the underlying type. So we allow * caller to pass in the underlying type. @@ -12958,8 +11708,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ static bool is_bpf_list_api_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || - btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] || + return is_bpf_list_push_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || btf_id == special_kfunc_list[KF_bpf_list_pop_back] || btf_id == special_kfunc_list[KF_bpf_list_front] || @@ -12968,7 +11717,7 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) static bool is_bpf_rbtree_api_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || + return is_bpf_rbtree_add_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || btf_id == special_kfunc_list[KF_bpf_rbtree_first] || btf_id == special_kfunc_list[KF_bpf_rbtree_root] || @@ -12985,8 +11734,9 @@ static bool is_bpf_iter_num_api_kfunc(u32 btf_id) static bool is_bpf_graph_api_kfunc(u32 btf_id) { - return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) || - btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; + return is_bpf_list_api_kfunc(btf_id) || + is_bpf_rbtree_api_kfunc(btf_id) || + is_bpf_refcount_acquire_kfunc(btf_id); } static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) @@ -13019,7 +11769,7 @@ static bool kfunc_spin_allowed(u32 btf_id) static bool is_sync_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; + return is_bpf_rbtree_add_kfunc(btf_id); } static bool is_async_callback_calling_kfunc(u32 btf_id) @@ -13083,12 +11833,11 @@ static bool 
check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, switch (node_field_type) { case BPF_LIST_NODE: - ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] || - kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]); + ret = is_bpf_list_push_kfunc(kfunc_btf_id); break; case BPF_RB_NODE: - ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || - kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] || + ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_left] || kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_right]); break; @@ -13133,7 +11882,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } rec = reg_btf_record(reg); - head_off = reg->off + reg->var_off.value; + head_off = reg->var_off.value; field = btf_record_find(rec, head_off, head_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", head_type_name, head_off); @@ -13200,7 +11949,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, return -EINVAL; } - node_off = reg->off + reg->var_off.value; + node_off = reg->var_off.value; field = reg_find_field_offset(reg, node_off, node_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", node_type_name, node_off); @@ -13305,11 +12054,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ bool is_ret_buf_sz = false; int kf_arg_type; - t = btf_type_skip_modifiers(btf, args[i].type, NULL); - - if (is_kfunc_arg_ignore(btf, &args[i])) - continue; - if (is_kfunc_arg_prog_aux(btf, &args[i])) { /* Reject repeated use bpf_prog_aux */ if (meta->arg_prog) { @@ -13321,6 +12065,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ continue; } + if (is_kfunc_arg_ignore(btf, &args[i]) || is_kfunc_arg_implicit(meta, i)) + continue; + + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + if 
(btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { verbose(env, "R%d is not a scalar\n", regno); @@ -13372,7 +12121,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - if ((register_is_null(reg) || type_may_be_null(reg->type)) && + if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); return -EACCES; @@ -13449,7 +12198,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } fallthrough; - case KF_ARG_PTR_TO_CTX: case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: @@ -13467,6 +12215,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; + case KF_ARG_PTR_TO_CTX: + arg_type = ARG_PTR_TO_CTX; + break; default: verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type); return -EFAULT; @@ -13495,13 +12246,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_ALLOC_BTF_ID: if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { - if (meta->func_id != special_kfunc_list[KF_bpf_obj_drop_impl]) { - verbose(env, "arg#%d expected for bpf_obj_drop_impl()\n", i); + if (!is_bpf_obj_drop_kfunc(meta->func_id)) { + verbose(env, "arg#%d expected for bpf_obj_drop()\n", i); return -EINVAL; } } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { - if (meta->func_id != special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { - verbose(env, "arg#%d expected for bpf_percpu_obj_drop_impl()\n", i); + if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) { + verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i); return -EINVAL; } } else { @@ -13627,7 +12378,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return ret; break; case 
KF_ARG_PTR_TO_RB_NODE: - if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (is_bpf_rbtree_add_kfunc(meta->func_id)) { if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "arg#%d expected pointer to allocated object\n", i); return -EINVAL; @@ -13690,7 +12441,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ struct bpf_reg_state *size_reg = ®s[regno + 1]; const struct btf_param *size_arg = &args[i + 1]; - if (!register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { + if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); if (ret < 0) { verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); @@ -13823,10 +12574,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return 0; } -static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env, - s32 func_id, - s16 offset, - struct bpf_kfunc_call_arg_meta *meta) +int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env, + s32 func_id, + s16 offset, + struct bpf_kfunc_call_arg_meta *meta) { struct bpf_kfunc_meta kfunc; int err; @@ -13849,6 +12600,194 @@ static int fetch_kfunc_arg_meta(struct bpf_verifier_env *env, return 0; } +/* + * Determine how many bytes a helper accesses through a stack pointer at + * argument position @arg (0-based, corresponding to R1-R5). 
+ * + * Returns: + * > 0 known read access size in bytes + * 0 doesn't read anything directly + * S64_MIN unknown + * < 0 known write access of (-return) bytes + */ +s64 bpf_helper_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn, + int arg, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + const struct bpf_func_proto *fn; + enum bpf_arg_type at; + s64 size; + + if (bpf_get_helper_proto(env, insn->imm, &fn) < 0) + return S64_MIN; + + at = fn->arg_type[arg]; + + switch (base_type(at)) { + case ARG_PTR_TO_MAP_KEY: + case ARG_PTR_TO_MAP_VALUE: { + bool is_key = base_type(at) == ARG_PTR_TO_MAP_KEY; + u64 val; + int i, map_reg; + + for (i = 0; i < arg; i++) { + if (base_type(fn->arg_type[i]) == ARG_CONST_MAP_PTR) + break; + } + if (i >= arg) + goto scan_all_maps; + + map_reg = BPF_REG_1 + i; + + if (!(aux->const_reg_map_mask & BIT(map_reg))) + goto scan_all_maps; + + i = aux->const_reg_vals[map_reg]; + if (i < env->used_map_cnt) { + size = is_key ? env->used_maps[i]->key_size + : env->used_maps[i]->value_size; + goto out; + } +scan_all_maps: + /* + * Map pointer is not known at this call site (e.g. different + * maps on merged paths). Conservatively return the largest + * key_size or value_size across all maps used by the program. + */ + val = 0; + for (i = 0; i < env->used_map_cnt; i++) { + struct bpf_map *map = env->used_maps[i]; + u32 sz = is_key ? map->key_size : map->value_size; + + if (sz > val) + val = sz; + if (map->inner_map_meta) { + sz = is_key ? 
map->inner_map_meta->key_size + : map->inner_map_meta->value_size; + if (sz > val) + val = sz; + } + } + if (!val) + return S64_MIN; + size = val; + goto out; + } + case ARG_PTR_TO_MEM: + if (at & MEM_FIXED_SIZE) { + size = fn->arg_size[arg]; + goto out; + } + if (arg + 1 < ARRAY_SIZE(fn->arg_type) && + arg_type_is_mem_size(fn->arg_type[arg + 1])) { + int size_reg = BPF_REG_1 + arg + 1; + + if (aux->const_reg_mask & BIT(size_reg)) { + size = (s64)aux->const_reg_vals[size_reg]; + goto out; + } + /* + * Size arg is const on each path but differs across merged + * paths. MAX_BPF_STACK is a safe upper bound for reads. + */ + if (at & MEM_UNINIT) + return 0; + return MAX_BPF_STACK; + } + return S64_MIN; + case ARG_PTR_TO_DYNPTR: + size = BPF_DYNPTR_SIZE; + break; + case ARG_PTR_TO_STACK: + /* + * Only used by bpf_calls_callback() helpers. The helper itself + * doesn't access stack. The callback subprog does and it's + * analyzed separately. + */ + return 0; + default: + return S64_MIN; + } +out: + /* + * MEM_UNINIT args are write-only: the helper initializes the + * buffer without reading it. + */ + if (at & MEM_UNINIT) + return -size; + return size; +} + +/* + * Determine how many bytes a kfunc accesses through a stack pointer at + * argument position @arg (0-based, corresponding to R1-R5). 
+ * + * Returns: + * > 0 known read access size in bytes + * 0 doesn't access memory through that argument (ex: not a pointer) + * S64_MIN unknown + * < 0 known write access of (-return) bytes + */ +s64 bpf_kfunc_stack_access_bytes(struct bpf_verifier_env *env, struct bpf_insn *insn, + int arg, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_kfunc_call_arg_meta meta; + const struct btf_param *args; + const struct btf_type *t, *ref_t; + const struct btf *btf; + u32 nargs, type_size; + s64 size; + + if (bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta) < 0) + return S64_MIN; + + btf = meta.btf; + args = btf_params(meta.func_proto); + nargs = btf_type_vlen(meta.func_proto); + if (arg >= nargs) + return 0; + + t = btf_type_skip_modifiers(btf, args[arg].type, NULL); + if (!btf_type_is_ptr(t)) + return 0; + + /* dynptr: fixed 16-byte on-stack representation */ + if (is_kfunc_arg_dynptr(btf, &args[arg])) { + size = BPF_DYNPTR_SIZE; + goto out; + } + + /* ptr + __sz/__szk pair: size is in the next register */ + if (arg + 1 < nargs && + (btf_param_match_suffix(btf, &args[arg + 1], "__sz") || + btf_param_match_suffix(btf, &args[arg + 1], "__szk"))) { + int size_reg = BPF_REG_1 + arg + 1; + + if (aux->const_reg_mask & BIT(size_reg)) { + size = (s64)aux->const_reg_vals[size_reg]; + goto out; + } + return MAX_BPF_STACK; + } + + /* fixed-size pointed-to type: resolve via BTF */ + ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + if (!IS_ERR(btf_resolve_size(btf, ref_t, &type_size))) { + size = type_size; + goto out; + } + + return S64_MIN; +out: + /* KF_ITER_NEW kfuncs initialize the iterator state at arg 0 */ + if (arg == 0 && meta.kfunc_flags & KF_ITER_NEW) + return -size; + if (is_kfunc_arg_uninit(btf, &args[arg])) + return -size; + return size; +} + /* check special kfuncs and return: * 1 - not fall-through to 'else' branch, continue verification * 0 - fall-through to 'else' branch @@ -13864,13 +12803,12 @@ static 
int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca if (meta->btf != btf_vmlinux) return 0; - if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || - meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (is_bpf_obj_new_kfunc(meta->func_id) || is_bpf_percpu_obj_new_kfunc(meta->func_id)) { struct btf_struct_meta *struct_meta; struct btf *ret_btf; u32 ret_btf_id; - if (meta->func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set) + if (is_bpf_obj_new_kfunc(meta->func_id) && !bpf_global_ma_set) return -ENOMEM; if (((u64)(u32)meta->arg_constant.value) != meta->arg_constant.value) { @@ -13893,7 +12831,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca return -EINVAL; } - if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) { if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) { verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n", ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE); @@ -13923,7 +12861,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); - if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { + if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) { if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) { verbose(env, "bpf_percpu_obj_new type ID argument must be of a struct of scalars\n"); return -EINVAL; @@ -13939,12 +12877,12 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; - if (meta->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) + if (is_bpf_percpu_obj_new_kfunc(meta->func_id)) regs[BPF_REG_0].type |= MEM_PERCPU; insn_aux->obj_new_size = ret_t->size; insn_aux->kptr_struct_meta = struct_meta; - } else if 
(meta->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { + } else if (is_bpf_refcount_acquire_kfunc(meta->func_id)) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[BPF_REG_0].btf = meta->arg_btf; @@ -14030,6 +12968,8 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); +static int process_bpf_exit_full(struct bpf_verifier_env *env, + bool *do_print_state, bool exception_exit); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) @@ -14049,7 +12989,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (!insn->imm) return 0; - err = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); + err = bpf_fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); if (err == -EACCES && meta.func_name) verbose(env, "calling kernel function %s is not allowed\n", meta.func_name); if (err) @@ -14058,7 +12998,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_name = meta.func_name; insn_aux = &env->insn_aux_data[insn_idx]; - insn_aux->is_iter_next = is_iter_next_kfunc(&meta); + insn_aux->is_iter_next = bpf_is_iter_next_kfunc(&meta); if (!insn->off && (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || @@ -14076,7 +13016,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Clear r0-r5 registers in forked state */ for (i = 0; i < CALLER_SAVED_REGS; i++) - mark_reg_not_init(env, regs, caller_saved[i]); + bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); mark_reg_unknown(env, regs, BPF_REG_0); err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); @@ -14095,7 +13035,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EACCES; } - sleepable = is_kfunc_sleepable(&meta); + sleepable = 
bpf_is_kfunc_sleepable(&meta); if (sleepable && !in_sleepable(env)) { verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name); return -EACCES; @@ -14110,7 +13050,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err < 0) return err; - if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (is_bpf_rbtree_add_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_rbtree_add_callback_state); if (err) { @@ -14170,34 +13110,24 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } })); } - } else if (sleepable && env->cur_state->active_rcu_locks) { - verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); - return -EACCES; - } - - if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { - verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); - return -EACCES; - } - - if (env->cur_state->active_preempt_locks) { - if (preempt_disable) { - env->cur_state->active_preempt_locks++; - } else if (preempt_enable) { - env->cur_state->active_preempt_locks--; - } else if (sleepable) { - verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); - return -EACCES; - } } else if (preempt_disable) { env->cur_state->active_preempt_locks++; } else if (preempt_enable) { - verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); - return -EINVAL; + if (env->cur_state->active_preempt_locks == 0) { + verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); + return -EINVAL; + } + env->cur_state->active_preempt_locks--; } - if (env->cur_state->active_irq_id && sleepable) { - verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name); + if (sleepable && !in_sleepable_context(env)) { + verbose(env, "kernel func %s is sleepable within %s\n", + func_name, 
non_sleepable_context_description(env)); + return -EACCES; + } + + if (in_rbtree_lock_required_cb(env) && (rcu_lock || rcu_unlock)) { + verbose(env, "Calling bpf_rcu_read_{lock,unlock} in unnecessary rbtree callback\n"); return -EACCES; } @@ -14224,11 +13154,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return err; } - if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || - meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || - meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { + if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; - insn_aux->insert_off = regs[BPF_REG_2].off; + insn_aux->insert_off = regs[BPF_REG_2].var_off.value; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { @@ -14266,7 +13194,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, for (i = 0; i < CALLER_SAVED_REGS; i++) { u32 regno = caller_saved[i]; - mark_reg_not_init(env, regs, regno); + bpf_mark_reg_not_init(env, ®s[regno]); regs[regno].subreg_def = DEF_NOT_SUBREG; } @@ -14274,11 +13202,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { - /* Only exception is bpf_obj_new_impl */ if (meta.btf != btf_vmlinux || - (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] && - meta.func_id != special_kfunc_list[KF_bpf_percpu_obj_new_impl] && - meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) { + (!is_bpf_obj_new_kfunc(meta.func_id) && + !is_bpf_percpu_obj_new_kfunc(meta.func_id) && + !is_bpf_refcount_acquire_kfunc(meta.func_id))) { verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); return 
-EINVAL; } @@ -14338,7 +13265,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_get_kmem_cache]) type |= PTR_UNTRUSTED; else if (is_kfunc_rcu_protected(&meta) || - (is_iter_next_kfunc(&meta) && + (bpf_is_iter_next_kfunc(&meta) && (get_iter_from_state(env->cur_state, &meta) ->type & MEM_RCU))) { /* @@ -14389,8 +13316,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].id = ++env->id_gen; } else if (btf_type_is_void(t)) { if (meta.btf == btf_vmlinux) { - if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || - meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl]) { + if (is_bpf_obj_drop_kfunc(meta.func_id) || + is_bpf_percpu_obj_drop_kfunc(meta.func_id)) { insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); @@ -14398,7 +13325,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } - if (is_kfunc_pkt_changing(&meta)) + if (bpf_is_kfunc_pkt_changing(&meta)) clear_all_pkt_pointers(env); nargs = btf_type_vlen(meta.func_proto); @@ -14410,11 +13337,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (btf_type_is_ptr(t)) mark_btf_func_reg_size(env, regno, sizeof(void *)); else - /* scalar. ensured by btf_check_kfunc_arg_match() */ + /* scalar. 
ensured by check_kfunc_args() */ mark_btf_func_reg_size(env, regno, t->size); } - if (is_iter_next_kfunc(&meta)) { + if (bpf_is_iter_next_kfunc(&meta)) { err = process_iter_next_call(env, insn_idx, &meta); if (err) return err; @@ -14423,12 +13350,15 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) env->prog->call_session_cookie = true; + if (is_bpf_throw_kfunc(insn)) + return process_bpf_exit_full(env, NULL, true); + return 0; } -static bool check_reg_sane_offset(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - enum bpf_reg_type type) +static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + enum bpf_reg_type type) { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; @@ -14440,12 +13370,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return false; } - if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { - verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str(env, type), reg->off); - return false; - } - if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", reg_type_str(env, type)); @@ -14461,6 +13385,29 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + enum bpf_reg_type type) +{ + bool known = tnum_is_const(reg->var_off); + s64 val = reg->var_off.value; + s64 smin = reg->smin_value; + + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "%s pointer offset %lld is not allowed\n", + reg_type_str(env, type), val); + return false; + } + + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { + verbose(env, "%s pointer offset %lld is not allowed\n", + reg_type_str(env, type), smin); + return false; + } 
+ + return true; +} + enum { REASON_BOUNDS = -1, REASON_TYPE = -2, @@ -14482,13 +13429,11 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, * currently prohibited for unprivileged. */ max = MAX_BPF_STACK + mask_to_left; - ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off); + ptr_limit = -ptr_reg->var_off.value; break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - ptr_limit = (mask_to_left ? - ptr_reg->smin_value : - ptr_reg->umax_value) + ptr_reg->off; + ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value; break; default: return REASON_TYPE; @@ -14719,9 +13664,6 @@ static int sanitize_err(struct bpf_verifier_env *env, * Variable offset is prohibited for unprivileged mode for simplicity since it * requires corresponding support in Spectre masking for stack ALU. See also * retrieve_ptr_limit(). - * - * - * 'off' includes 'reg->off'. */ static int check_stack_access_for_ptr_arithmetic( struct bpf_verifier_env *env, @@ -14762,11 +13704,11 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, switch (dst_reg->type) { case PTR_TO_STACK: if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg, - dst_reg->off + dst_reg->var_off.value)) + dst_reg->var_off.value)) return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -14874,8 +13816,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->type = ptr_reg->type; dst_reg->id = ptr_reg->id; - if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || - !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) + if (!check_reg_sane_offset_scalar(env, off_reg, ptr_reg->type) || + !check_reg_sane_offset_ptr(env, ptr_reg, ptr_reg->type)) return -EINVAL; /* pointer types do not carry 32-bit bounds at 
the moment. */ @@ -14890,23 +13832,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: - /* We can take a fixed offset as long as it doesn't overflow - * the s32 'off' field - */ - if (known && (ptr_reg->off + smin_val == - (s64)(s32)(ptr_reg->off + smin_val))) { - /* pointer += K. Accumulate it into fixed offset */ - dst_reg->smin_value = smin_ptr; - dst_reg->smax_value = smax_ptr; - dst_reg->umin_value = umin_ptr; - dst_reg->umax_value = umax_ptr; - dst_reg->var_off = ptr_reg->var_off; - dst_reg->off = ptr_reg->off + smin_val; - dst_reg->raw = ptr_reg->raw; - break; - } - /* A new variable offset is created. Note that off_reg->off - * == 0, since it's a scalar. + /* * dst_reg gets the pointer type and since some positive * integer value was added to the pointer, give it a new 'id' * if it's a PTR_TO_PACKET. @@ -14925,12 +13851,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = U64_MAX; } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); - dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { - dst_reg->id = ++env->id_gen; - /* something was added to pkt_ptr, set range to zero */ - memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); + if (!known) + dst_reg->id = ++env->id_gen; + /* + * Clear range for unknown addends since we can't know + * where the pkt pointer ended up. Also clear AT_PKT_END / + * BEYOND_PKT_END from prior comparison as any pointer + * arithmetic invalidates them. + */ + if (!known || dst_reg->range < 0) + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_SUB: @@ -14949,19 +13881,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - if (known && (ptr_reg->off - smin_val == - (s64)(s32)(ptr_reg->off - smin_val))) { - /* pointer -= K. 
Subtract it from fixed offset */ - dst_reg->smin_value = smin_ptr; - dst_reg->smax_value = smax_ptr; - dst_reg->umin_value = umin_ptr; - dst_reg->umax_value = umax_ptr; - dst_reg->var_off = ptr_reg->var_off; - dst_reg->id = ptr_reg->id; - dst_reg->off = ptr_reg->off - smin_val; - dst_reg->raw = ptr_reg->raw; - break; - } /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. */ @@ -14981,12 +13900,18 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = umax_ptr - umin_val; } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); - dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { - dst_reg->id = ++env->id_gen; - /* something was added to pkt_ptr, set range to zero */ - if (smin_val < 0) + if (!known) + dst_reg->id = ++env->id_gen; + /* + * Clear range if the subtrahend may be negative since + * pkt pointer could move past its bounds. A positive + * subtrahend moves it backwards keeping positive range + * intact. Also clear AT_PKT_END / BEYOND_PKT_END from + * prior comparison as arithmetic invalidates them. 
+ */ + if ((!known && smin_val < 0) || dst_reg->range < 0) memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; @@ -15004,7 +13929,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) + if (!check_reg_sane_offset_ptr(env, dst_reg, ptr_reg->type)) return -EINVAL; reg_bounds_sync(dst_reg); bounds_ret = sanitize_check_bounds(env, insn, dst_reg); @@ -15856,6 +14781,13 @@ static void scalar_byte_swap(struct bpf_reg_state *dst_reg, struct bpf_insn *ins /* Apply bswap if alu64 or switch between big-endian and little-endian machines */ bool need_bswap = alu64 || (to_le == is_big_endian); + /* + * If the register is mutated, manually reset its scalar ID to break + * any existing ties and avoid incorrect bounds propagation. + */ + if (need_bswap || insn->imm == 16 || insn->imm == 32) + clear_scalar_id(dst_reg); + if (need_bswap) { if (insn->imm == 16) dst_reg->var_off = tnum_bswap16(dst_reg->var_off); @@ -15938,7 +14870,7 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins else return 0; - branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + branch = push_stack(env, env->insn_idx, env->insn_idx, false); if (IS_ERR(branch)) return PTR_ERR(branch); @@ -16127,11 +15059,20 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, int err; dst_reg = ®s[insn->dst_reg]; - src_reg = NULL; + if (BPF_SRC(insn->code) == BPF_X) + src_reg = ®s[insn->src_reg]; + else + src_reg = NULL; - if (dst_reg->type == PTR_TO_ARENA) { + /* Case where at least one operand is an arena. */ + if (dst_reg->type == PTR_TO_ARENA || (src_reg && src_reg->type == PTR_TO_ARENA)) { struct bpf_insn_aux_data *aux = cur_aux(env); + if (dst_reg->type != PTR_TO_ARENA) + *dst_reg = *src_reg; + + dst_reg->subreg_def = env->insn_idx + 1; + if (BPF_CLASS(insn->code) == BPF_ALU64) /* * 32-bit operations zero upper bits automatically. 
@@ -16147,7 +15088,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, ptr_reg = dst_reg; if (BPF_SRC(insn->code) == BPF_X) { - src_reg = ®s[insn->src_reg]; if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields @@ -16230,7 +15170,8 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, */ if (env->bpf_capable && (BPF_OP(insn->code) == BPF_ADD || BPF_OP(insn->code) == BPF_SUB) && - dst_reg->id && is_reg_const(src_reg, alu32)) { + dst_reg->id && is_reg_const(src_reg, alu32) && + !(BPF_SRC(insn->code) == BPF_X && insn->src_reg == insn->dst_reg)) { u64 val = reg_const_value(src_reg, alu32); s32 off; @@ -16255,21 +15196,20 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * we cannot accumulate another val into rx->off. */ clear_id: - dst_reg->off = 0; - dst_reg->id = 0; + clear_scalar_id(dst_reg); } else { if (alu32) dst_reg->id |= BPF_ADD_CONST32; else dst_reg->id |= BPF_ADD_CONST64; - dst_reg->off = off; + dst_reg->delta = off; } } else { /* * Make sure ID is cleared otherwise dst_reg min/max could be * incorrectly propagated into other registers by sync_linked_regs() */ - dst_reg->id = 0; + clear_scalar_id(dst_reg); } return 0; } @@ -16282,23 +15222,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int err; if (opcode == BPF_END || opcode == BPF_NEG) { - if (opcode == BPF_NEG) { - if (BPF_SRC(insn->code) != BPF_K || - insn->src_reg != BPF_REG_0 || - insn->off != 0 || insn->imm != 0) { - verbose(env, "BPF_NEG uses reserved fields\n"); - return -EINVAL; - } - } else { - if (insn->src_reg != BPF_REG_0 || insn->off != 0 || - (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || - (BPF_CLASS(insn->code) == BPF_ALU64 && - BPF_SRC(insn->code) != BPF_TO_LE)) { - verbose(env, "BPF_END uses reserved fields\n"); - return -EINVAL; - } - } - /* check src operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) @@ 
-16311,8 +15234,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - if ((opcode == BPF_NEG || opcode == BPF_END) && - regs[insn->dst_reg].type == SCALAR_VALUE) { + if (regs[insn->dst_reg].type == SCALAR_VALUE) { err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); err = err ?: adjust_scalar_min_max_vals(env, insn, ®s[insn->dst_reg], @@ -16326,38 +15248,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (opcode == BPF_MOV) { if (BPF_SRC(insn->code) == BPF_X) { - if (BPF_CLASS(insn->code) == BPF_ALU) { - if ((insn->off != 0 && insn->off != 8 && insn->off != 16) || - insn->imm) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } - } else if (insn->off == BPF_ADDR_SPACE_CAST) { - if (insn->imm != 1 && insn->imm != 1u << 16) { - verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); - return -EINVAL; - } + if (insn->off == BPF_ADDR_SPACE_CAST) { if (!env->prog->aux->arena) { verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n"); return -EINVAL; } - } else { - if ((insn->off != 0 && insn->off != 8 && insn->off != 16 && - insn->off != 32) || insn->imm) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } } /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; - } else { - if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose(env, "BPF_MOV uses reserved fields\n"); - return -EINVAL; - } } /* check dest operand, mark as required later */ @@ -16400,7 +15301,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); if (!no_sext) - dst_reg->id = 0; + clear_scalar_id(dst_reg); coerce_reg_to_size_sx(dst_reg, insn->off >> 3); dst_reg->subreg_def = DEF_NOT_SUBREG; } else { @@ -16426,7 +15327,7 @@ static int 
check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * propagated into src_reg by sync_linked_regs() */ if (!is_src_reg_u32) - dst_reg->id = 0; + clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ @@ -16436,7 +15337,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); if (!no_sext) - dst_reg->id = 0; + clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); } @@ -16463,28 +15364,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } - } else if (opcode > BPF_END) { - verbose(env, "invalid BPF_ALU opcode %x\n", opcode); - return -EINVAL; - } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) || - (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { - verbose(env, "BPF_ALU uses reserved fields\n"); - return -EINVAL; - } /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) return err; - } else { - if (insn->src_reg != BPF_REG_0 || (insn->off != 0 && insn->off != 1) || - (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { - verbose(env, "BPF_ALU uses reserved fields\n"); - return -EINVAL; - } } /* check src2 operand */ @@ -16527,19 +15413,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *reg; int new_range; - if (dst_reg->off < 0 || - (dst_reg->off == 0 && range_right_open)) + if (dst_reg->umax_value == 0 && range_right_open) /* This doesn't give us any range */ return; - if (dst_reg->umax_value > MAX_PACKET_OFF || - dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF) + if (dst_reg->umax_value > MAX_PACKET_OFF) /* Risk of overflow. 
For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. */ return; - new_range = dst_reg->off; + new_range = dst_reg->umax_value; if (range_right_open) new_range++; @@ -16588,7 +15472,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, /* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. - * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. + * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16. */ bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) @@ -16597,11 +15481,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, })); } +static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, + u8 opcode, bool is_jmp32); +static u8 rev_opcode(u8 opcode); + +/* + * Learn more information about live branches by simulating refinement on both branches. + * regs_refine_cond_op() is sound, so producing ill-formed register bounds for the branch means + * that branch is dead. + */ +static int simulate_both_branches_taken(struct bpf_verifier_env *env, u8 opcode, bool is_jmp32) +{ + /* Fallthrough (FALSE) branch */ + regs_refine_cond_op(&env->false_reg1, &env->false_reg2, rev_opcode(opcode), is_jmp32); + reg_bounds_sync(&env->false_reg1); + reg_bounds_sync(&env->false_reg2); + /* + * If there is a range bounds violation in *any* of the abstract values in either + * reg_states in the FALSE branch (i.e. reg1, reg2), the FALSE branch must be dead. Only + * TRUE branch will be taken. 
+ */ + if (range_bounds_violation(&env->false_reg1) || range_bounds_violation(&env->false_reg2)) + return 1; + + /* Jump (TRUE) branch */ + regs_refine_cond_op(&env->true_reg1, &env->true_reg2, opcode, is_jmp32); + reg_bounds_sync(&env->true_reg1); + reg_bounds_sync(&env->true_reg2); + /* + * If there is a range bounds violation in *any* of the abstract values in either + * reg_states in the TRUE branch (i.e. true_reg1, true_reg2), the TRUE branch must be dead. + * Only FALSE branch will be taken. + */ + if (range_bounds_violation(&env->true_reg1) || range_bounds_violation(&env->true_reg2)) + return 0; + + /* Both branches are possible, we can't determine which one will be taken. */ + return -1; +} + /* * <reg1> <op> <reg2>, currently assuming reg2 is a constant */ -static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, - u8 opcode, bool is_jmp32) +static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1, + struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; @@ -16753,7 +15676,7 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta break; } - return -1; + return simulate_both_branches_taken(env, opcode, is_jmp32); } static int flip_opcode(u32 opcode) @@ -16824,8 +15747,8 @@ static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, * -1 - unknown. 
Example: "if (reg1 < 5)" is unknown when register value * range [0,10] */ -static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg2, - u8 opcode, bool is_jmp32) +static int is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *reg1, + struct bpf_reg_state *reg2, u8 opcode, bool is_jmp32) { if (reg_is_pkt_pointer_any(reg1) && reg_is_pkt_pointer_any(reg2) && !is_jmp32) return is_pkt_ptr_branch_taken(reg1, reg2, opcode); @@ -16863,7 +15786,7 @@ static int is_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_state *reg } /* now deal with two scalars, but not necessarily constants */ - return is_scalar_branch_taken(reg1, reg2, opcode, is_jmp32); + return is_scalar_branch_taken(env, reg1, reg2, opcode, is_jmp32); } /* Opcode that corresponds to a *false* branch condition. @@ -16954,8 +15877,8 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state /* u32_min_value is not equal to 0xffffffff at this point, * because otherwise u32_max_value is 0xffffffff as well, * in such a case both reg1 and reg2 would be constants, - * jump would be predicted and reg_set_min_max() won't - * be called. + * jump would be predicted and regs_refine_cond_op() + * wouldn't be called. * * Same reasoning works for all {u,s}{min,max}{32,64} cases * below. @@ -17062,49 +15985,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state } } -/* Adjusts the register min/max values in the case that the dst_reg and - * src_reg are both SCALAR_VALUE registers (or we are simply doing a BPF_K - * check, in which case we have a fake SCALAR_VALUE representing insn->imm). - * Technically we can do similar adjustments for pointers to the same object, - * but we don't support that right now. 
- */ -static int reg_set_min_max(struct bpf_verifier_env *env, - struct bpf_reg_state *true_reg1, - struct bpf_reg_state *true_reg2, - struct bpf_reg_state *false_reg1, - struct bpf_reg_state *false_reg2, - u8 opcode, bool is_jmp32) +/* Check for invariant violations on the registers for both branches of a condition */ +static int regs_bounds_sanity_check_branches(struct bpf_verifier_env *env) { int err; - /* If either register is a pointer, we can't learn anything about its - * variable offset from the compare (unless they were a pointer into - * the same object, but we don't bother with that). - */ - if (false_reg1->type != SCALAR_VALUE || false_reg2->type != SCALAR_VALUE) - return 0; - - /* We compute branch direction for same SCALAR_VALUE registers in - * is_scalar_branch_taken(). For unknown branch directions (e.g., BPF_JSET) - * on the same registers, we don't need to adjust the min/max values. - */ - if (false_reg1 == false_reg2) - return 0; - - /* fallthrough (FALSE) branch */ - regs_refine_cond_op(false_reg1, false_reg2, rev_opcode(opcode), is_jmp32); - reg_bounds_sync(false_reg1); - reg_bounds_sync(false_reg2); - - /* jump (TRUE) branch */ - regs_refine_cond_op(true_reg1, true_reg2, opcode, is_jmp32); - reg_bounds_sync(true_reg1); - reg_bounds_sync(true_reg2); - - err = reg_bounds_sanity_check(env, true_reg1, "true_reg1"); - err = err ?: reg_bounds_sanity_check(env, true_reg2, "true_reg2"); - err = err ?: reg_bounds_sanity_check(env, false_reg1, "false_reg1"); - err = err ?: reg_bounds_sanity_check(env, false_reg2, "false_reg2"); + err = reg_bounds_sanity_check(env, &env->true_reg1, "true_reg1"); + err = err ?: reg_bounds_sanity_check(env, &env->true_reg2, "true_reg2"); + err = err ?: reg_bounds_sanity_check(env, &env->false_reg1, "false_reg1"); + err = err ?: reg_bounds_sanity_check(env, &env->false_reg2, "false_reg2"); return err; } @@ -17114,29 +16003,24 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, { if (type_may_be_null(reg->type) 
&& reg->id == id && (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { - /* Old offset (both fixed and variable parts) should have been - * known-zero, because we don't allow pointer arithmetic on - * pointers that might be NULL. If we see this happening, don't - * convert the register. + /* Old offset should have been known-zero, because we don't + * allow pointer arithmetic on pointers that might be NULL. + * If we see this happening, don't convert the register. * * But in some cases, some helpers that return local kptrs - * advance offset for the returned pointer. In those cases, it - * is fine to expect to see reg->off. + * advance offset for the returned pointer. In those cases, + * it is fine to expect to see reg->var_off. */ - if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0))) - return; if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) && - WARN_ON_ONCE(reg->off)) + WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0))) return; - if (is_null) { - reg->type = SCALAR_VALUE; /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ - reg->id = 0; - reg->ref_obj_id = 0; + __mark_reg_known_zero(reg); + reg->type = SCALAR_VALUE; return; } @@ -17297,7 +16181,7 @@ static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_st e->is_reg = is_reg; e->regno = spi_or_reg; } else { - reg->id = 0; + clear_scalar_id(reg); } } @@ -17305,22 +16189,29 @@ static void __collect_linked_regs(struct linked_regs *reg_set, struct bpf_reg_st * in verifier state, save R in linked_regs if R->id == id. * If there are too many Rs sharing same id, reset id for leftover Rs. 
*/ -static void collect_linked_regs(struct bpf_verifier_state *vstate, u32 id, +static void collect_linked_regs(struct bpf_verifier_env *env, + struct bpf_verifier_state *vstate, + u32 id, struct linked_regs *linked_regs) { + struct bpf_insn_aux_data *aux = env->insn_aux_data; struct bpf_func_state *func; struct bpf_reg_state *reg; + u16 live_regs; int i, j; id = id & ~BPF_ADD_CONST; for (i = vstate->curframe; i >= 0; i--) { + live_regs = aux[bpf_frame_insn_idx(vstate, i)].live_regs_before; func = vstate->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { + if (!(live_regs & BIT(j))) + continue; reg = &func->regs[j]; __collect_linked_regs(linked_regs, reg, id, i, j, true); } for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { - if (!is_spilled_reg(&func->stack[j])) + if (!bpf_is_spilled_reg(&func->stack[j])) continue; reg = &func->stack[j].spilled_ptr; __collect_linked_regs(linked_regs, reg, id, i, j, false); @@ -17347,19 +16238,25 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s continue; if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) continue; + /* + * Skip mixed 32/64-bit links: the delta relationship doesn't + * hold across different ALU widths. 
+ */ + if (((reg->id ^ known_reg->id) & BPF_ADD_CONST) == BPF_ADD_CONST) + continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || - reg->off == known_reg->off) { + reg->delta == known_reg->delta) { s32 saved_subreg_def = reg->subreg_def; copy_register_state(reg, known_reg); reg->subreg_def = saved_subreg_def; } else { s32 saved_subreg_def = reg->subreg_def; - s32 saved_off = reg->off; + s32 saved_off = reg->delta; u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; - __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off); + __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta); /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); @@ -17367,14 +16264,14 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. */ - reg->off = saved_off; + reg->delta = saved_off; reg->id = saved_id; reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); reg->var_off = tnum_add(reg->var_off, fake_reg.var_off); - if (known_reg->id & BPF_ADD_CONST32) + if ((reg->id | known_reg->id) & BPF_ADD_CONST32) zext_32_to_64(reg); reg_bounds_sync(reg); } @@ -17410,12 +16307,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_verifier_state *cur_st = env->cur_state, *queued_st, *prev_st; int idx = *insn_idx; - if (insn->code != (BPF_JMP | BPF_JCOND) || - insn->src_reg != BPF_MAY_GOTO || - insn->dst_reg || insn->imm) { - verbose(env, "invalid may_goto imm %d\n", insn->imm); - return -EINVAL; - } prev_st = find_prev_entry(env, cur_st->parent, idx); /* branch out 'fallthrough' insn as a new state to explore */ @@ -17437,11 +16328,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, dst_reg = ®s[insn->dst_reg]; if (BPF_SRC(insn->code) == BPF_X) { - if (insn->imm != 0) { - verbose(env, "BPF_JMP/JMP32 uses reserved 
fields\n"); - return -EINVAL; - } - /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -17460,10 +16346,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == PTR_TO_STACK) insn_flags |= INSN_F_DST_REG_STACK; } else { - if (insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); - return -EINVAL; - } src_reg = &env->fake_reg[0]; memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; @@ -17474,13 +16356,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (insn_flags) { - err = push_jmp_history(env, this_branch, insn_flags, 0); + err = bpf_push_jmp_history(env, this_branch, insn_flags, 0); if (err) return err; } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - pred = is_branch_taken(dst_reg, src_reg, opcode, is_jmp32); + copy_register_state(&env->false_reg1, dst_reg); + copy_register_state(&env->false_reg2, src_reg); + copy_register_state(&env->true_reg1, dst_reg); + copy_register_state(&env->true_reg2, src_reg); + pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because * above is_branch_taken() special cased the 0 comparison. @@ -17530,11 +16416,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * if parent state is created. 
*/ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id) - collect_linked_regs(this_branch, src_reg->id, &linked_regs); + collect_linked_regs(env, this_branch, src_reg->id, &linked_regs); if (dst_reg->type == SCALAR_VALUE && dst_reg->id) - collect_linked_regs(this_branch, dst_reg->id, &linked_regs); + collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -17544,27 +16430,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return PTR_ERR(other_branch); other_branch_regs = other_branch->frame[other_branch->curframe]->regs; - if (BPF_SRC(insn->code) == BPF_X) { - err = reg_set_min_max(env, - &other_branch_regs[insn->dst_reg], - &other_branch_regs[insn->src_reg], - dst_reg, src_reg, opcode, is_jmp32); - } else /* BPF_SRC(insn->code) == BPF_K */ { - /* reg_set_min_max() can mangle the fake_reg. Make a copy - * so that these are two different memory locations. The - * src_reg is not used beyond here in context of K. 
- */ - memcpy(&env->fake_reg[1], &env->fake_reg[0], - sizeof(env->fake_reg[0])); - err = reg_set_min_max(env, - &other_branch_regs[insn->dst_reg], - &env->fake_reg[0], - dst_reg, &env->fake_reg[1], - opcode, is_jmp32); - } + err = regs_bounds_sanity_check_branches(env); if (err) return err; + copy_register_state(dst_reg, &env->false_reg1); + copy_register_state(src_reg, &env->false_reg2); + copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1); + if (BPF_SRC(insn->code) == BPF_X) + copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2); + if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && !WARN_ON_ONCE(src_reg->id != other_branch_regs[insn->src_reg].id)) { @@ -17617,12 +16492,15 @@ } /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). + * Also does the same detection for a register whose value is + * known to be 0. * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. */ - if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && - insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - type_may_be_null(dst_reg->type)) { + if (!is_jmp32 && (opcode == BPF_JEQ || opcode == BPF_JNE) && + type_may_be_null(dst_reg->type) && + ((BPF_SRC(insn->code) == BPF_K && insn->imm == 0) || + (BPF_SRC(insn->code) == BPF_X && bpf_register_is_null(src_reg)))) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. 
*/ @@ -17655,10 +16533,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } - if (insn->off != 0) { - verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); - return -EINVAL; - } err = check_reg_arg(env, insn->dst_reg, DST_OP); if (err) @@ -17698,8 +16572,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->src_reg == BPF_PSEUDO_FUNC) { struct bpf_prog_aux *aux = env->prog->aux; - u32 subprogno = find_subprog(env, - env->insn_idx + insn->imm + 1); + u32 subprogno = bpf_find_subprog(env, + env->insn_idx + insn->imm + 1); if (!aux->func_info) { verbose(env, "missing btf func_info\n"); @@ -17716,22 +16590,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { if (map->map_type == BPF_MAP_TYPE_ARENA) { __mark_reg_unknown(env, dst_reg); + dst_reg->map_ptr = map; return 0; } + __mark_reg_known(dst_reg, aux->map_off); dst_reg->type = PTR_TO_MAP_VALUE; - dst_reg->off = aux->map_off; + dst_reg->map_ptr = map; WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; + dst_reg->map_ptr = map; } else { verifier_bug(env, "unexpected src reg value for ldimm64"); return -EFAULT; @@ -17784,13 +16660,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EFAULT; } - if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || - BPF_SIZE(insn->code) == BPF_DW || - (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); - return -EINVAL; - } - /* check whether implicit source operand 
(register R6) is readable */ err = check_reg_arg(env, ctx_reg, SRC_OP); if (err) @@ -17823,7 +16692,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(env, regs, caller_saved[i]); + bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -17834,107 +16703,59 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_unknown(env, regs, BPF_REG_0); /* ld_abs load up to 32-bit skb data. */ regs[BPF_REG_0].subreg_def = env->insn_idx + 1; + /* + * See bpf_gen_ld_abs() which emits a hidden BPF_EXIT with r0=0 + * which must be explored by the verifier when in a subprog. + */ + if (env->cur_state->curframe) { + struct bpf_verifier_state *branch; + + mark_reg_scratched(env, BPF_REG_0); + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + mark_reg_known_zero(env, regs, BPF_REG_0); + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + env->insn_idx--; + } return 0; } -static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) + +static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_range *range) { - const char *exit_ctx = "At program exit"; - struct tnum enforce_attach_type_range = tnum_unknown; - const struct bpf_prog *prog = env->prog; - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - int err; - struct bpf_func_state *frame = env->cur_state->frame[0]; - const bool is_subprog = frame->subprogno; - bool return_32bit = false; - const struct btf_type *reg_type, *ret_type = NULL; - /* LSM and struct_ops func-ptr's return type could be "void" */ - if (!is_subprog || frame->in_exception_callback_fn) { - 
switch (prog_type) { - case BPF_PROG_TYPE_LSM: - if (prog->expected_attach_type == BPF_LSM_CGROUP) - /* See below, can be 0 or 0-1 depending on hook. */ - break; - if (!prog->aux->attach_func_proto->type) - return 0; - break; - case BPF_PROG_TYPE_STRUCT_OPS: - if (!prog->aux->attach_func_proto->type) - return 0; + /* Default return value range. */ + *range = retval_range(0, 1); - if (frame->in_exception_callback_fn) - break; - - /* Allow a struct_ops program to return a referenced kptr if it - * matches the operator's return type and is in its unmodified - * form. A scalar zero (i.e., a null pointer) is also allowed. - */ - reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL; - ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, - prog->aux->attach_func_proto->type, - NULL); - if (ret_type && ret_type == reg_type && reg->ref_obj_id) - return __check_ptr_off_reg(env, reg, regno, false); + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (env->prog->expected_attach_type) { + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: + *range = retval_range(1, 1); + break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + *range = retval_range(0, 3); break; default: break; } - } - - /* eBPF calling convention is such that R0 is used - * to return the value from eBPF program. 
- * Make sure that it's readable at this time - * of bpf_exit, which means that program wrote - * something into it earlier - */ - err = check_reg_arg(env, regno, SRC_OP); - if (err) - return err; - - if (is_pointer_value(env, regno)) { - verbose(env, "R%d leaks addr as return value\n", regno); - return -EACCES; - } - - if (frame->in_async_callback_fn) { - exit_ctx = "At async callback return"; - range = frame->callback_ret_range; - goto enforce_retval; - } - - if (is_subprog && !frame->in_exception_callback_fn) { - if (reg->type != SCALAR_VALUE) { - verbose(env, "At subprogram exit the register R%d is not a scalar value (%s)\n", - regno, reg_type_str(env, reg->type)); - return -EINVAL; - } - return 0; - } - - switch (prog_type) { - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) - range = retval_range(1, 1); - if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || - env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) - range = retval_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: - if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { - range = retval_range(0, 3); - enforce_attach_type_range = tnum_range(2, 3); - } + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) + *range = retval_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: @@ -17944,72 +16765,164 @@ static int check_return_code(struct 
bpf_verifier_env *env, int regno, const char break; case BPF_PROG_TYPE_RAW_TRACEPOINT: if (!env->prog->aux->attach_btf_id) - return 0; - range = retval_range(0, 0); + return false; + *range = retval_range(0, 0); break; case BPF_PROG_TYPE_TRACING: switch (env->prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: - range = retval_range(0, 0); + *range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: case BPF_MODIFY_RETURN: - return 0; + return false; case BPF_TRACE_ITER: - break; default: - return -ENOTSUPP; + break; } break; case BPF_PROG_TYPE_KPROBE: switch (env->prog->expected_attach_type) { case BPF_TRACE_KPROBE_SESSION: case BPF_TRACE_UPROBE_SESSION: - range = retval_range(0, 1); break; default: - return 0; + return false; } break; case BPF_PROG_TYPE_SK_LOOKUP: - range = retval_range(SK_DROP, SK_PASS); + *range = retval_range(SK_DROP, SK_PASS); break; case BPF_PROG_TYPE_LSM: if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { /* no range found, any return value is allowed */ - if (!get_func_retval_range(env->prog, &range)) - return 0; + if (!get_func_retval_range(env->prog, range)) + return false; /* no restricted range, any return value is allowed */ - if (range.minval == S32_MIN && range.maxval == S32_MAX) - return 0; - return_32bit = true; + if (range->minval == S32_MIN && range->maxval == S32_MAX) + return false; + range->return_32bit = true; } else if (!env->prog->aux->attach_func_proto->type) { /* Make sure programs that attach to void * hooks don't try to modify return value. 
*/ - range = retval_range(1, 1); + *range = retval_range(1, 1); } break; case BPF_PROG_TYPE_NETFILTER: - range = retval_range(NF_DROP, NF_ACCEPT); + *range = retval_range(NF_DROP, NF_ACCEPT); break; case BPF_PROG_TYPE_STRUCT_OPS: - if (!ret_type) - return 0; - range = retval_range(0, 0); + *range = retval_range(0, 0); break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. */ default: + return false; + } + + /* Continue calculating. */ + + return true; +} + +static bool program_returns_void(struct bpf_verifier_env *env) +{ + const struct bpf_prog *prog = env->prog; + enum bpf_prog_type prog_type = prog->type; + + switch (prog_type) { + case BPF_PROG_TYPE_LSM: + /* See return_retval_range, for BPF_LSM_CGROUP can be 0 or 0-1 depending on hook. */ + if (prog->expected_attach_type != BPF_LSM_CGROUP && + !prog->aux->attach_func_proto->type) + return true; + break; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!prog->aux->attach_func_proto->type) + return true; + break; + case BPF_PROG_TYPE_EXT: + /* + * If the actual program is an extension, let it + * return void - attaching will succeed only if the + * program being replaced also returns void, and since + * it has passed verification its actual type doesn't matter. 
+ */ + if (subprog_returns_void(env, 0)) + return true; + break; + default: + break; + } + return false; +} + +static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) +{ + const char *exit_ctx = "At program exit"; + struct tnum enforce_attach_type_range = tnum_unknown; + const struct bpf_prog *prog = env->prog; + struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_retval_range range = retval_range(0, 1); + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + struct bpf_func_state *frame = env->cur_state->frame[0]; + const struct btf_type *reg_type, *ret_type = NULL; + int err; + + /* LSM and struct_ops func-ptr's return type could be "void" */ + if (!frame->in_async_callback_fn && program_returns_void(env)) return 0; + + if (prog_type == BPF_PROG_TYPE_STRUCT_OPS) { + /* Allow a struct_ops program to return a referenced kptr if it + * matches the operator's return type and is in its unmodified + * form. A scalar zero (i.e., a null pointer) is also allowed. + */ + reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL; + ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, + prog->aux->attach_func_proto->type, + NULL); + if (ret_type && ret_type == reg_type && reg->ref_obj_id) + return __check_ptr_off_reg(env, reg, regno, false); } + /* eBPF calling convention is such that R0 is used + * to return the value from eBPF program. 
+ * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(env, regno, SRC_OP); + if (err) + return err; + + if (is_pointer_value(env, regno)) { + verbose(env, "R%d leaks addr as return value\n", regno); + return -EACCES; + } + + if (frame->in_async_callback_fn) { + exit_ctx = "At async callback return"; + range = frame->callback_ret_range; + goto enforce_retval; + } + + if (prog_type == BPF_PROG_TYPE_STRUCT_OPS && !ret_type) + return 0; + + if (prog_type == BPF_PROG_TYPE_CGROUP_SKB && (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS)) + enforce_attach_type_range = tnum_range(2, 3); + + if (!return_retval_range(env, &range)) + return 0; + enforce_retval: if (reg->type != SCALAR_VALUE) { verbose(env, "%s the register R%d is not a known value (%s)\n", @@ -18021,10 +16934,9 @@ enforce_retval: if (err) return err; - if (!retval_range_within(range, reg, return_32bit)) { + if (!retval_range_within(range, reg)) { verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); - if (!is_subprog && - prog->expected_attach_type == BPF_LSM_CGROUP && + if (prog->expected_attach_type == BPF_LSM_CGROUP && prog_type == BPF_PROG_TYPE_LSM && !prog->aux->attach_func_proto->type) verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); @@ -18037,189 +16949,31 @@ enforce_retval: return 0; } -static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) -{ - struct bpf_subprog_info *subprog; - - subprog = bpf_find_containing_subprog(env, off); - subprog->changes_pkt_data = true; -} - -static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) -{ - struct bpf_subprog_info *subprog; - - subprog = bpf_find_containing_subprog(env, off); - subprog->might_sleep = true; -} - -/* 't' is an index of a call-site. - * 'w' is a callee entry point. 
- * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. - * Rely on DFS traversal order and absence of recursive calls to guarantee that - * callee's change_pkt_data marks would be correct at that moment. - */ -static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) -{ - struct bpf_subprog_info *caller, *callee; - - caller = bpf_find_containing_subprog(env, t); - callee = bpf_find_containing_subprog(env, w); - caller->changes_pkt_data |= callee->changes_pkt_data; - caller->might_sleep |= callee->might_sleep; -} - -/* non-recursive DFS pseudo code - * 1 procedure DFS-iterative(G,v): - * 2 label v as discovered - * 3 let S be a stack - * 4 S.push(v) - * 5 while S is not empty - * 6 t <- S.peek() - * 7 if t is what we're looking for: - * 8 return t - * 9 for all edges e in G.adjacentEdges(t) do - * 10 if edge e is already labelled - * 11 continue with the next edge - * 12 w <- G.adjacentVertex(t,e) - * 13 if vertex w is not discovered and not explored - * 14 label e as tree-edge - * 15 label w as discovered - * 16 S.push(w) - * 17 continue at 5 - * 18 else if vertex w is discovered - * 19 label e as back-edge - * 20 else - * 21 // vertex w is explored - * 22 label e as forward- or cross-edge - * 23 label t as explored - * 24 S.pop() - * - * convention: - * 0x10 - discovered - * 0x11 - discovered and fall-through edge labelled - * 0x12 - discovered and fall-through and branch edges labelled - * 0x20 - explored - */ - -enum { - DISCOVERED = 0x10, - EXPLORED = 0x20, - FALLTHROUGH = 1, - BRANCH = 2, -}; - -static void mark_prune_point(struct bpf_verifier_env *env, int idx) -{ - env->insn_aux_data[idx].prune_point = true; -} - -static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) -{ - return env->insn_aux_data[insn_idx].prune_point; -} - -static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx) -{ - env->insn_aux_data[idx].force_checkpoint = true; -} - -static bool 
is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) -{ - return env->insn_aux_data[insn_idx].force_checkpoint; -} - -static void mark_calls_callback(struct bpf_verifier_env *env, int idx) -{ - env->insn_aux_data[idx].calls_callback = true; -} - -bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx) -{ - return env->insn_aux_data[insn_idx].calls_callback; -} - -enum { - DONE_EXPLORING = 0, - KEEP_EXPLORING = 1, -}; - -/* t, w, e - match pseudo-code above: - * t - index of current instruction - * w - next instruction - * e - edge - */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) +static int check_global_subprog_return_code(struct bpf_verifier_env *env) { - int *insn_stack = env->cfg.insn_stack; - int *insn_state = env->cfg.insn_state; - - if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) - return DONE_EXPLORING; + struct bpf_reg_state *reg = reg_state(env, BPF_REG_0); + struct bpf_func_state *cur_frame = cur_func(env); + int err; - if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) - return DONE_EXPLORING; + if (subprog_returns_void(env, cur_frame->subprogno)) + return 0; - if (w < 0 || w >= env->prog->len) { - verbose_linfo(env, t, "%d: ", t); - verbose(env, "jump out of range from insn %d to %d\n", t, w); - return -EINVAL; - } + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; - if (e == BRANCH) { - /* mark branch target for state pruning */ - mark_prune_point(env, w); - mark_jmp_point(env, w); + if (is_pointer_value(env, BPF_REG_0)) { + verbose(env, "R%d leaks addr as return value\n", BPF_REG_0); + return -EACCES; } - if (insn_state[w] == 0) { - /* tree-edge */ - insn_state[t] = DISCOVERED | e; - insn_state[w] = DISCOVERED; - if (env->cfg.cur_stack >= env->prog->len) - return -E2BIG; - insn_stack[env->cfg.cur_stack++] = w; - return KEEP_EXPLORING; - } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (env->bpf_capable) - return DONE_EXPLORING; - 
verbose_linfo(env, t, "%d: ", t); - verbose_linfo(env, w, "%d: ", w); - verbose(env, "back-edge from insn %d to %d\n", t, w); + if (reg->type != SCALAR_VALUE) { + verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", + reg_type_str(env, reg->type)); return -EINVAL; - } else if (insn_state[w] == EXPLORED) { - /* forward- or cross-edge */ - insn_state[t] = DISCOVERED | e; - } else { - verifier_bug(env, "insn state internal bug"); - return -EFAULT; } - return DONE_EXPLORING; -} -static int visit_func_call_insn(int t, struct bpf_insn *insns, - struct bpf_verifier_env *env, - bool visit_callee) -{ - int ret, insn_sz; - int w; - - insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; - ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); - if (ret) - return ret; - - mark_prune_point(env, t + insn_sz); - /* when we exit from subprog, we need to record non-linear history */ - mark_jmp_point(env, t + insn_sz); - - if (visit_callee) { - w = t + insns[t].imm + 1; - mark_prune_point(env, t); - merge_callee_effects(env, t, w); - ret = push_insn(t, w, BRANCH, env); - } - return ret; + return 0; } /* Bitmask with 1s for all caller saved registers */ @@ -18229,7 +16983,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, * replacement patch is presumed to follow bpf_fastcall contract * (see mark_fastcall_pattern_for_call() below). */ -static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) +bool bpf_verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) { switch (imm) { #ifdef CONFIG_X86_64 @@ -18245,17 +16999,11 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -struct call_summary { - u8 num_params; - bool is_void; - bool fastcall; -}; - /* If @call is a kfunc or helper call, fills @cs and returns true, * otherwise returns false. 
*/ -static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, - struct call_summary *cs) +bool bpf_get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, + struct bpf_call_summary *cs) { struct bpf_kfunc_call_arg_meta meta; const struct bpf_func_proto *fn; @@ -18263,11 +17011,11 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call if (bpf_helper_call(call)) { - if (get_helper_proto(env, call->imm, &fn) < 0) + if (bpf_get_helper_proto(env, call->imm, &fn) < 0) /* error would be reported later */ return false; cs->fastcall = fn->allow_fastcall && - (verifier_inlines_helper_call(env, call->imm) || + (bpf_verifier_inlines_helper_call(env, call->imm) || bpf_jit_inlines_helper_call(call->imm)); cs->is_void = fn->ret_type == RET_VOID; cs->num_params = 0; @@ -18282,7 +17030,7 @@ static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call if (bpf_pseudo_kfunc_call(call)) { int err; - err = fetch_kfunc_arg_meta(env, call->imm, call->off, &meta); + err = bpf_fetch_kfunc_arg_meta(env, call->imm, call->off, &meta); if (err < 0) /* error would be reported later */ return false; @@ -18376,12 +17124,12 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; u32 clobbered_regs_mask; - struct call_summary cs; + struct bpf_call_summary cs; u32 expected_regs_mask; s16 off; int i; - if (!get_call_summary(env, call, &cs)) + if (!bpf_get_call_summary(env, call, &cs)) return; /* A bitmask specifying which caller saved registers are clobbered @@ -18484,714 +17232,6 @@ static int mark_fastcall_patterns(struct bpf_verifier_env *env) return 0; } -static struct bpf_iarray *iarray_realloc(struct bpf_iarray *old, size_t n_elem) -{ - size_t new_size = sizeof(struct bpf_iarray) + n_elem * sizeof(old->items[0]); - struct bpf_iarray *new; - - new = kvrealloc(old, new_size, 
GFP_KERNEL_ACCOUNT); - if (!new) { - /* this is what callers always want, so simplify the call site */ - kvfree(old); - return NULL; - } - - new->cnt = n_elem; - return new; -} - -static int copy_insn_array(struct bpf_map *map, u32 start, u32 end, u32 *items) -{ - struct bpf_insn_array_value *value; - u32 i; - - for (i = start; i <= end; i++) { - value = map->ops->map_lookup_elem(map, &i); - /* - * map_lookup_elem of an array map will never return an error, - * but not checking it makes some static analysers to worry - */ - if (IS_ERR(value)) - return PTR_ERR(value); - else if (!value) - return -EINVAL; - items[i - start] = value->xlated_off; - } - return 0; -} - -static int cmp_ptr_to_u32(const void *a, const void *b) -{ - return *(u32 *)a - *(u32 *)b; -} - -static int sort_insn_array_uniq(u32 *items, int cnt) -{ - int unique = 1; - int i; - - sort(items, cnt, sizeof(items[0]), cmp_ptr_to_u32, NULL); - - for (i = 1; i < cnt; i++) - if (items[i] != items[unique - 1]) - items[unique++] = items[i]; - - return unique; -} - -/* - * sort_unique({map[start], ..., map[end]}) into off - */ -static int copy_insn_array_uniq(struct bpf_map *map, u32 start, u32 end, u32 *off) -{ - u32 n = end - start + 1; - int err; - - err = copy_insn_array(map, start, end, off); - if (err) - return err; - - return sort_insn_array_uniq(off, n); -} - -/* - * Copy all unique offsets from the map - */ -static struct bpf_iarray *jt_from_map(struct bpf_map *map) -{ - struct bpf_iarray *jt; - int err; - int n; - - jt = iarray_realloc(NULL, map->max_entries); - if (!jt) - return ERR_PTR(-ENOMEM); - - n = copy_insn_array_uniq(map, 0, map->max_entries - 1, jt->items); - if (n < 0) { - err = n; - goto err_free; - } - if (n == 0) { - err = -EINVAL; - goto err_free; - } - jt->cnt = n; - return jt; - -err_free: - kvfree(jt); - return ERR_PTR(err); -} - -/* - * Find and collect all maps which fit in the subprog. 
Return the result as one - * combined jump table in jt->items (allocated with kvcalloc) - */ -static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env, - int subprog_start, int subprog_end) -{ - struct bpf_iarray *jt = NULL; - struct bpf_map *map; - struct bpf_iarray *jt_cur; - int i; - - for (i = 0; i < env->insn_array_map_cnt; i++) { - /* - * TODO (when needed): collect only jump tables, not static keys - * or maps for indirect calls - */ - map = env->insn_array_maps[i]; - - jt_cur = jt_from_map(map); - if (IS_ERR(jt_cur)) { - kvfree(jt); - return jt_cur; - } - - /* - * This is enough to check one element. The full table is - * checked to fit inside the subprog later in create_jt() - */ - if (jt_cur->items[0] >= subprog_start && jt_cur->items[0] < subprog_end) { - u32 old_cnt = jt ? jt->cnt : 0; - jt = iarray_realloc(jt, old_cnt + jt_cur->cnt); - if (!jt) { - kvfree(jt_cur); - return ERR_PTR(-ENOMEM); - } - memcpy(jt->items + old_cnt, jt_cur->items, jt_cur->cnt << 2); - } - - kvfree(jt_cur); - } - - if (!jt) { - verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start); - return ERR_PTR(-EINVAL); - } - - jt->cnt = sort_insn_array_uniq(jt->items, jt->cnt); - return jt; -} - -static struct bpf_iarray * -create_jt(int t, struct bpf_verifier_env *env) -{ - static struct bpf_subprog_info *subprog; - int subprog_start, subprog_end; - struct bpf_iarray *jt; - int i; - - subprog = bpf_find_containing_subprog(env, t); - subprog_start = subprog->start; - subprog_end = (subprog + 1)->start; - jt = jt_from_subprog(env, subprog_start, subprog_end); - if (IS_ERR(jt)) - return jt; - - /* Check that the every element of the jump table fits within the given subprogram */ - for (i = 0; i < jt->cnt; i++) { - if (jt->items[i] < subprog_start || jt->items[i] >= subprog_end) { - verbose(env, "jump table for insn %d points outside of the subprog [%u,%u]\n", - t, subprog_start, subprog_end); - kvfree(jt); - return ERR_PTR(-EINVAL); - } - } - - return 
jt; -} - -/* "conditional jump with N edges" */ -static int visit_gotox_insn(int t, struct bpf_verifier_env *env) -{ - int *insn_stack = env->cfg.insn_stack; - int *insn_state = env->cfg.insn_state; - bool keep_exploring = false; - struct bpf_iarray *jt; - int i, w; - - jt = env->insn_aux_data[t].jt; - if (!jt) { - jt = create_jt(t, env); - if (IS_ERR(jt)) - return PTR_ERR(jt); - - env->insn_aux_data[t].jt = jt; - } - - mark_prune_point(env, t); - for (i = 0; i < jt->cnt; i++) { - w = jt->items[i]; - if (w < 0 || w >= env->prog->len) { - verbose(env, "indirect jump out of range from insn %d to %d\n", t, w); - return -EINVAL; - } - - mark_jmp_point(env, w); - - /* EXPLORED || DISCOVERED */ - if (insn_state[w]) - continue; - - if (env->cfg.cur_stack >= env->prog->len) - return -E2BIG; - - insn_stack[env->cfg.cur_stack++] = w; - insn_state[w] |= DISCOVERED; - keep_exploring = true; - } - - return keep_exploring ? KEEP_EXPLORING : DONE_EXPLORING; -} - -static int visit_tailcall_insn(struct bpf_verifier_env *env, int t) -{ - static struct bpf_subprog_info *subprog; - struct bpf_iarray *jt; - - if (env->insn_aux_data[t].jt) - return 0; - - jt = iarray_realloc(NULL, 2); - if (!jt) - return -ENOMEM; - - subprog = bpf_find_containing_subprog(env, t); - jt->items[0] = t + 1; - jt->items[1] = subprog->exit_idx; - env->insn_aux_data[t].jt = jt; - return 0; -} - -/* Visits the instruction at index t and returns one of the following: - * < 0 - an error occurred - * DONE_EXPLORING - the instruction was fully explored - * KEEP_EXPLORING - there is still work to be done before it is fully explored - */ -static int visit_insn(int t, struct bpf_verifier_env *env) -{ - struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; - int ret, off, insn_sz; - - if (bpf_pseudo_func(insn)) - return visit_func_call_insn(t, insns, env, true); - - /* All non-branch instructions have a single fall-through edge. 
*/ - if (BPF_CLASS(insn->code) != BPF_JMP && - BPF_CLASS(insn->code) != BPF_JMP32) { - insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; - return push_insn(t, t + insn_sz, FALLTHROUGH, env); - } - - switch (BPF_OP(insn->code)) { - case BPF_EXIT: - return DONE_EXPLORING; - - case BPF_CALL: - if (is_async_callback_calling_insn(insn)) - /* Mark this call insn as a prune point to trigger - * is_state_visited() check before call itself is - * processed by __check_func_call(). Otherwise new - * async state will be pushed for further exploration. - */ - mark_prune_point(env, t); - /* For functions that invoke callbacks it is not known how many times - * callback would be called. Verifier models callback calling functions - * by repeatedly visiting callback bodies and returning to origin call - * instruction. - * In order to stop such iteration verifier needs to identify when a - * state identical some state from a previous iteration is reached. - * Check below forces creation of checkpoint before callback calling - * instruction to allow search for such identical states. - */ - if (is_sync_callback_calling_insn(insn)) { - mark_calls_callback(env, t); - mark_force_checkpoint(env, t); - mark_prune_point(env, t); - mark_jmp_point(env, t); - } - if (bpf_helper_call(insn)) { - const struct bpf_func_proto *fp; - - ret = get_helper_proto(env, insn->imm, &fp); - /* If called in a non-sleepable context program will be - * rejected anyway, so we should end up with precise - * sleepable marks on subprogs, except for dead code - * elimination. 
- */ - if (ret == 0 && fp->might_sleep) - mark_subprog_might_sleep(env, t); - if (bpf_helper_changes_pkt_data(insn->imm)) - mark_subprog_changes_pkt_data(env, t); - if (insn->imm == BPF_FUNC_tail_call) - visit_tailcall_insn(env, t); - } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - struct bpf_kfunc_call_arg_meta meta; - - ret = fetch_kfunc_arg_meta(env, insn->imm, insn->off, &meta); - if (ret == 0 && is_iter_next_kfunc(&meta)) { - mark_prune_point(env, t); - /* Checking and saving state checkpoints at iter_next() call - * is crucial for fast convergence of open-coded iterator loop - * logic, so we need to force it. If we don't do that, - * is_state_visited() might skip saving a checkpoint, causing - * unnecessarily long sequence of not checkpointed - * instructions and jumps, leading to exhaustion of jump - * history buffer, and potentially other undesired outcomes. - * It is expected that with correct open-coded iterators - * convergence will happen quickly, so we don't run a risk of - * exhausting memory. - */ - mark_force_checkpoint(env, t); - } - /* Same as helpers, if called in a non-sleepable context - * program will be rejected anyway, so we should end up - * with precise sleepable marks on subprogs, except for - * dead code elimination. 
- */ - if (ret == 0 && is_kfunc_sleepable(&meta)) - mark_subprog_might_sleep(env, t); - if (ret == 0 && is_kfunc_pkt_changing(&meta)) - mark_subprog_changes_pkt_data(env, t); - } - return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); - - case BPF_JA: - if (BPF_SRC(insn->code) == BPF_X) - return visit_gotox_insn(t, env); - - if (BPF_CLASS(insn->code) == BPF_JMP) - off = insn->off; - else - off = insn->imm; - - /* unconditional jump with single edge */ - ret = push_insn(t, t + off + 1, FALLTHROUGH, env); - if (ret) - return ret; - - mark_prune_point(env, t + off + 1); - mark_jmp_point(env, t + off + 1); - - return ret; - - default: - /* conditional jump with two edges */ - mark_prune_point(env, t); - if (is_may_goto_insn(insn)) - mark_force_checkpoint(env, t); - - ret = push_insn(t, t + 1, FALLTHROUGH, env); - if (ret) - return ret; - - return push_insn(t, t + insn->off + 1, BRANCH, env); - } -} - -/* non-recursive depth-first-search to detect loops in BPF program - * loop == back-edge in directed graph - */ -static int check_cfg(struct bpf_verifier_env *env) -{ - int insn_cnt = env->prog->len; - int *insn_stack, *insn_state; - int ex_insn_beg, i, ret = 0; - - insn_state = env->cfg.insn_state = kvzalloc_objs(int, insn_cnt, - GFP_KERNEL_ACCOUNT); - if (!insn_state) - return -ENOMEM; - - insn_stack = env->cfg.insn_stack = kvzalloc_objs(int, insn_cnt, - GFP_KERNEL_ACCOUNT); - if (!insn_stack) { - kvfree(insn_state); - return -ENOMEM; - } - - ex_insn_beg = env->exception_callback_subprog - ? 
env->subprog_info[env->exception_callback_subprog].start - : 0; - - insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ - insn_stack[0] = 0; /* 0 is the first instruction */ - env->cfg.cur_stack = 1; - -walk_cfg: - while (env->cfg.cur_stack > 0) { - int t = insn_stack[env->cfg.cur_stack - 1]; - - ret = visit_insn(t, env); - switch (ret) { - case DONE_EXPLORING: - insn_state[t] = EXPLORED; - env->cfg.cur_stack--; - break; - case KEEP_EXPLORING: - break; - default: - if (ret > 0) { - verifier_bug(env, "visit_insn internal bug"); - ret = -EFAULT; - } - goto err_free; - } - } - - if (env->cfg.cur_stack < 0) { - verifier_bug(env, "pop stack internal bug"); - ret = -EFAULT; - goto err_free; - } - - if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) { - insn_state[ex_insn_beg] = DISCOVERED; - insn_stack[0] = ex_insn_beg; - env->cfg.cur_stack = 1; - goto walk_cfg; - } - - for (i = 0; i < insn_cnt; i++) { - struct bpf_insn *insn = &env->prog->insnsi[i]; - - if (insn_state[i] != EXPLORED) { - verbose(env, "unreachable insn %d\n", i); - ret = -EINVAL; - goto err_free; - } - if (bpf_is_ldimm64(insn)) { - if (insn_state[i + 1] != 0) { - verbose(env, "jump into the middle of ldimm64 insn %d\n", i); - ret = -EINVAL; - goto err_free; - } - i++; /* skip second half of ldimm64 */ - } - } - ret = 0; /* cfg looks good */ - env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; - env->prog->aux->might_sleep = env->subprog_info[0].might_sleep; - -err_free: - kvfree(insn_state); - kvfree(insn_stack); - env->cfg.insn_state = env->cfg.insn_stack = NULL; - return ret; -} - -/* - * For each subprogram 'i' fill array env->cfg.insn_subprogram sub-range - * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start) - * with indices of 'i' instructions in postorder. 
- */ -static int compute_postorder(struct bpf_verifier_env *env) -{ - u32 cur_postorder, i, top, stack_sz, s; - int *stack = NULL, *postorder = NULL, *state = NULL; - struct bpf_iarray *succ; - - postorder = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT); - state = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT); - stack = kvzalloc_objs(int, env->prog->len, GFP_KERNEL_ACCOUNT); - if (!postorder || !state || !stack) { - kvfree(postorder); - kvfree(state); - kvfree(stack); - return -ENOMEM; - } - cur_postorder = 0; - for (i = 0; i < env->subprog_cnt; i++) { - env->subprog_info[i].postorder_start = cur_postorder; - stack[0] = env->subprog_info[i].start; - stack_sz = 1; - do { - top = stack[stack_sz - 1]; - state[top] |= DISCOVERED; - if (state[top] & EXPLORED) { - postorder[cur_postorder++] = top; - stack_sz--; - continue; - } - succ = bpf_insn_successors(env, top); - for (s = 0; s < succ->cnt; ++s) { - if (!state[succ->items[s]]) { - stack[stack_sz++] = succ->items[s]; - state[succ->items[s]] |= DISCOVERED; - } - } - state[top] |= EXPLORED; - } while (stack_sz); - } - env->subprog_info[i].postorder_start = cur_postorder; - env->cfg.insn_postorder = postorder; - env->cfg.cur_postorder = cur_postorder; - kvfree(stack); - kvfree(state); - return 0; -} - -static int check_abnormal_return(struct bpf_verifier_env *env) -{ - int i; - - for (i = 1; i < env->subprog_cnt; i++) { - if (env->subprog_info[i].has_ld_abs) { - verbose(env, "LD_ABS is not allowed in subprogs without BTF\n"); - return -EINVAL; - } - if (env->subprog_info[i].has_tail_call) { - verbose(env, "tail_call is not allowed in subprogs without BTF\n"); - return -EINVAL; - } - } - return 0; -} - -/* The minimum supported BTF func info size */ -#define MIN_BPF_FUNCINFO_SIZE 8 -#define MAX_FUNCINFO_REC_SIZE 252 - -static int check_btf_func_early(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) -{ - u32 krec_size = sizeof(struct bpf_func_info); - const struct btf_type 
*type, *func_proto; - u32 i, nfuncs, urec_size, min_size; - struct bpf_func_info *krecord; - struct bpf_prog *prog; - const struct btf *btf; - u32 prev_offset = 0; - bpfptr_t urecord; - int ret = -ENOMEM; - - nfuncs = attr->func_info_cnt; - if (!nfuncs) { - if (check_abnormal_return(env)) - return -EINVAL; - return 0; - } - - urec_size = attr->func_info_rec_size; - if (urec_size < MIN_BPF_FUNCINFO_SIZE || - urec_size > MAX_FUNCINFO_REC_SIZE || - urec_size % sizeof(u32)) { - verbose(env, "invalid func info rec size %u\n", urec_size); - return -EINVAL; - } - - prog = env->prog; - btf = prog->aux->btf; - - urecord = make_bpfptr(attr->func_info, uattr.is_kernel); - min_size = min_t(u32, krec_size, urec_size); - - krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); - if (!krecord) - return -ENOMEM; - - for (i = 0; i < nfuncs; i++) { - ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); - if (ret) { - if (ret == -E2BIG) { - verbose(env, "nonzero tailing record in func info"); - /* set the size kernel expects so loader can zero - * out the rest of the record. 
- */ - if (copy_to_bpfptr_offset(uattr, - offsetof(union bpf_attr, func_info_rec_size), - &min_size, sizeof(min_size))) - ret = -EFAULT; - } - goto err_free; - } - - if (copy_from_bpfptr(&krecord[i], urecord, min_size)) { - ret = -EFAULT; - goto err_free; - } - - /* check insn_off */ - ret = -EINVAL; - if (i == 0) { - if (krecord[i].insn_off) { - verbose(env, - "nonzero insn_off %u for the first func info record", - krecord[i].insn_off); - goto err_free; - } - } else if (krecord[i].insn_off <= prev_offset) { - verbose(env, - "same or smaller insn offset (%u) than previous func info record (%u)", - krecord[i].insn_off, prev_offset); - goto err_free; - } - - /* check type_id */ - type = btf_type_by_id(btf, krecord[i].type_id); - if (!type || !btf_type_is_func(type)) { - verbose(env, "invalid type id %d in func info", - krecord[i].type_id); - goto err_free; - } - - func_proto = btf_type_by_id(btf, type->type); - if (unlikely(!func_proto || !btf_type_is_func_proto(func_proto))) - /* btf_func_check() already verified it during BTF load */ - goto err_free; - - prev_offset = krecord[i].insn_off; - bpfptr_add(&urecord, urec_size); - } - - prog->aux->func_info = krecord; - prog->aux->func_info_cnt = nfuncs; - return 0; - -err_free: - kvfree(krecord); - return ret; -} - -static int check_btf_func(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) -{ - const struct btf_type *type, *func_proto, *ret_type; - u32 i, nfuncs, urec_size; - struct bpf_func_info *krecord; - struct bpf_func_info_aux *info_aux = NULL; - struct bpf_prog *prog; - const struct btf *btf; - bpfptr_t urecord; - bool scalar_return; - int ret = -ENOMEM; - - nfuncs = attr->func_info_cnt; - if (!nfuncs) { - if (check_abnormal_return(env)) - return -EINVAL; - return 0; - } - if (nfuncs != env->subprog_cnt) { - verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); - return -EINVAL; - } - - urec_size = attr->func_info_rec_size; - - prog = env->prog; - btf = 
prog->aux->btf; - - urecord = make_bpfptr(attr->func_info, uattr.is_kernel); - - krecord = prog->aux->func_info; - info_aux = kzalloc_objs(*info_aux, nfuncs, - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); - if (!info_aux) - return -ENOMEM; - - for (i = 0; i < nfuncs; i++) { - /* check insn_off */ - ret = -EINVAL; - - if (env->subprog_info[i].start != krecord[i].insn_off) { - verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); - goto err_free; - } - - /* Already checked type_id */ - type = btf_type_by_id(btf, krecord[i].type_id); - info_aux[i].linkage = BTF_INFO_VLEN(type->info); - /* Already checked func_proto */ - func_proto = btf_type_by_id(btf, type->type); - - ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL); - scalar_return = - btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type); - if (i && !scalar_return && env->subprog_info[i].has_ld_abs) { - verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n"); - goto err_free; - } - if (i && !scalar_return && env->subprog_info[i].has_tail_call) { - verbose(env, "tail_call is only allowed in functions that return 'int'.\n"); - goto err_free; - } - - bpfptr_add(&urecord, urec_size); - } - - prog->aux->func_info_aux = info_aux; - return 0; - -err_free: - kfree(info_aux); - return ret; -} - static void adjust_btf_func(struct bpf_verifier_env *env) { struct bpf_prog_aux *aux = env->prog->aux; @@ -19205,414 +17245,6 @@ static void adjust_btf_func(struct bpf_verifier_env *env) aux->func_info[i].insn_off = env->subprog_info[i].start; } -#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col) -#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE - -static int check_btf_line(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) -{ - u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; - struct bpf_subprog_info *sub; - struct bpf_line_info *linfo; - struct bpf_prog *prog; - const struct btf *btf; - bpfptr_t 
ulinfo; - int err; - - nr_linfo = attr->line_info_cnt; - if (!nr_linfo) - return 0; - if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) - return -EINVAL; - - rec_size = attr->line_info_rec_size; - if (rec_size < MIN_BPF_LINEINFO_SIZE || - rec_size > MAX_LINEINFO_REC_SIZE || - rec_size & (sizeof(u32) - 1)) - return -EINVAL; - - /* Need to zero it in case the userspace may - * pass in a smaller bpf_line_info object. - */ - linfo = kvzalloc_objs(struct bpf_line_info, nr_linfo, - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); - if (!linfo) - return -ENOMEM; - - prog = env->prog; - btf = prog->aux->btf; - - s = 0; - sub = env->subprog_info; - ulinfo = make_bpfptr(attr->line_info, uattr.is_kernel); - expected_size = sizeof(struct bpf_line_info); - ncopy = min_t(u32, expected_size, rec_size); - for (i = 0; i < nr_linfo; i++) { - err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size); - if (err) { - if (err == -E2BIG) { - verbose(env, "nonzero tailing record in line_info"); - if (copy_to_bpfptr_offset(uattr, - offsetof(union bpf_attr, line_info_rec_size), - &expected_size, sizeof(expected_size))) - err = -EFAULT; - } - goto err_free; - } - - if (copy_from_bpfptr(&linfo[i], ulinfo, ncopy)) { - err = -EFAULT; - goto err_free; - } - - /* - * Check insn_off to ensure - * 1) strictly increasing AND - * 2) bounded by prog->len - * - * The linfo[0].insn_off == 0 check logically falls into - * the later "missing bpf_line_info for func..." case - * because the first linfo[0].insn_off must be the - * first sub also and the first sub must have - * subprog_info[0].start == 0. 
- */ - if ((i && linfo[i].insn_off <= prev_offset) || - linfo[i].insn_off >= prog->len) { - verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n", - i, linfo[i].insn_off, prev_offset, - prog->len); - err = -EINVAL; - goto err_free; - } - - if (!prog->insnsi[linfo[i].insn_off].code) { - verbose(env, - "Invalid insn code at line_info[%u].insn_off\n", - i); - err = -EINVAL; - goto err_free; - } - - if (!btf_name_by_offset(btf, linfo[i].line_off) || - !btf_name_by_offset(btf, linfo[i].file_name_off)) { - verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); - err = -EINVAL; - goto err_free; - } - - if (s != env->subprog_cnt) { - if (linfo[i].insn_off == sub[s].start) { - sub[s].linfo_idx = i; - s++; - } else if (sub[s].start < linfo[i].insn_off) { - verbose(env, "missing bpf_line_info for func#%u\n", s); - err = -EINVAL; - goto err_free; - } - } - - prev_offset = linfo[i].insn_off; - bpfptr_add(&ulinfo, rec_size); - } - - if (s != env->subprog_cnt) { - verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n", - env->subprog_cnt - s, s); - err = -EINVAL; - goto err_free; - } - - prog->aux->linfo = linfo; - prog->aux->nr_linfo = nr_linfo; - - return 0; - -err_free: - kvfree(linfo); - return err; -} - -#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo) -#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE - -static int check_core_relo(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) -{ - u32 i, nr_core_relo, ncopy, expected_size, rec_size; - struct bpf_core_relo core_relo = {}; - struct bpf_prog *prog = env->prog; - const struct btf *btf = prog->aux->btf; - struct bpf_core_ctx ctx = { - .log = &env->log, - .btf = btf, - }; - bpfptr_t u_core_relo; - int err; - - nr_core_relo = attr->core_relo_cnt; - if (!nr_core_relo) - return 0; - if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo)) - return -EINVAL; - - rec_size = attr->core_relo_rec_size; - if (rec_size < MIN_CORE_RELO_SIZE 
|| - rec_size > MAX_CORE_RELO_SIZE || - rec_size % sizeof(u32)) - return -EINVAL; - - u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel); - expected_size = sizeof(struct bpf_core_relo); - ncopy = min_t(u32, expected_size, rec_size); - - /* Unlike func_info and line_info, copy and apply each CO-RE - * relocation record one at a time. - */ - for (i = 0; i < nr_core_relo; i++) { - /* future proofing when sizeof(bpf_core_relo) changes */ - err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size); - if (err) { - if (err == -E2BIG) { - verbose(env, "nonzero tailing record in core_relo"); - if (copy_to_bpfptr_offset(uattr, - offsetof(union bpf_attr, core_relo_rec_size), - &expected_size, sizeof(expected_size))) - err = -EFAULT; - } - break; - } - - if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) { - err = -EFAULT; - break; - } - - if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) { - verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n", - i, core_relo.insn_off, prog->len); - err = -EINVAL; - break; - } - - err = bpf_core_apply(&ctx, &core_relo, i, - &prog->insnsi[core_relo.insn_off / 8]); - if (err) - break; - bpfptr_add(&u_core_relo, rec_size); - } - return err; -} - -static int check_btf_info_early(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) -{ - struct btf *btf; - int err; - - if (!attr->func_info_cnt && !attr->line_info_cnt) { - if (check_abnormal_return(env)) - return -EINVAL; - return 0; - } - - btf = btf_get_by_fd(attr->prog_btf_fd); - if (IS_ERR(btf)) - return PTR_ERR(btf); - if (btf_is_kernel(btf)) { - btf_put(btf); - return -EACCES; - } - env->prog->aux->btf = btf; - - err = check_btf_func_early(env, attr, uattr); - if (err) - return err; - return 0; -} - -static int check_btf_info(struct bpf_verifier_env *env, - const union bpf_attr *attr, - bpfptr_t uattr) -{ - int err; - - if (!attr->func_info_cnt && !attr->line_info_cnt) { - if (check_abnormal_return(env)) - return 
-EINVAL; - return 0; - } - - err = check_btf_func(env, attr, uattr); - if (err) - return err; - - err = check_btf_line(env, attr, uattr); - if (err) - return err; - - err = check_core_relo(env, attr, uattr); - if (err) - return err; - - return 0; -} - -/* check %cur's range satisfies %old's */ -static bool range_within(const struct bpf_reg_state *old, - const struct bpf_reg_state *cur) -{ - return old->umin_value <= cur->umin_value && - old->umax_value >= cur->umax_value && - old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value && - old->u32_min_value <= cur->u32_min_value && - old->u32_max_value >= cur->u32_max_value && - old->s32_min_value <= cur->s32_min_value && - old->s32_max_value >= cur->s32_max_value; -} - -/* If in the old state two registers had the same id, then they need to have - * the same id in the new state as well. But that id could be different from - * the old state, so we need to track the mapping from old to new ids. - * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent - * regs with old id 5 must also have new id 9 for the new state to be safe. But - * regs with a different old id could still have new id 9, we don't care about - * that. - * So we look through our idmap to see if this old id has been seen before. If - * so, we require the new id to match; otherwise, we add the id pair to the map. 
- */ -static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) -{ - struct bpf_id_pair *map = idmap->map; - unsigned int i; - - /* either both IDs should be set or both should be zero */ - if (!!old_id != !!cur_id) - return false; - - if (old_id == 0) /* cur_id == 0 as well */ - return true; - - for (i = 0; i < idmap->cnt; i++) { - if (map[i].old == old_id) - return map[i].cur == cur_id; - if (map[i].cur == cur_id) - return false; - } - - /* Reached the end of known mappings; haven't seen this id before */ - if (idmap->cnt < BPF_ID_MAP_SIZE) { - map[idmap->cnt].old = old_id; - map[idmap->cnt].cur = cur_id; - idmap->cnt++; - return true; - } - - /* We ran out of idmap slots, which should be impossible */ - WARN_ON_ONCE(1); - return false; -} - -/* - * Compare scalar register IDs for state equivalence. - * - * When old_id == 0, the old register is independent - not linked to any - * other register. Any linking in the current state only adds constraints, - * making it more restrictive. Since the old state didn't rely on any ID - * relationships for this register, it's always safe to accept cur regardless - * of its ID. Hence, return true immediately. - * - * When old_id != 0 but cur_id == 0, we need to ensure that different - * independent registers in cur don't incorrectly satisfy the ID matching - * requirements of linked registers in old. - * - * Example: if old has r6.id=X and r7.id=X (linked), but cur has r6.id=0 - * and r7.id=0 (both independent), without temp IDs both would map old_id=X - * to cur_id=0 and pass. With temp IDs: r6 maps X->temp1, r7 tries to map - * X->temp2, but X is already mapped to temp1, so the check fails correctly. - */ -static bool check_scalar_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) -{ - if (!old_id) - return true; - - cur_id = cur_id ? 
cur_id : ++idmap->tmp_id_gen; - - return check_ids(old_id, cur_id, idmap); -} - -static void clean_func_state(struct bpf_verifier_env *env, - struct bpf_func_state *st, - u32 ip) -{ - u16 live_regs = env->insn_aux_data[ip].live_regs_before; - int i, j; - - for (i = 0; i < BPF_REG_FP; i++) { - /* liveness must not touch this register anymore */ - if (!(live_regs & BIT(i))) - /* since the register is unused, clear its state - * to make further comparison simpler - */ - __mark_reg_not_init(env, &st->regs[i]); - } - - for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { - if (!bpf_stack_slot_alive(env, st->frameno, i)) { - __mark_reg_not_init(env, &st->stack[i].spilled_ptr); - for (j = 0; j < BPF_REG_SIZE; j++) - st->stack[i].slot_type[j] = STACK_INVALID; - } - } -} - -static void clean_verifier_state(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) -{ - int i, ip; - - bpf_live_stack_query_init(env, st); - st->cleaned = true; - for (i = 0; i <= st->curframe; i++) { - ip = frame_insn_idx(st, i); - clean_func_state(env, st->frame[i], ip); - } -} - -/* the parentage chains form a tree. - * the verifier states are added to state lists at given insn and - * pushed into state stack for future exploration. - * when the verifier reaches bpf_exit insn some of the verifier states - * stored in the state lists have their final liveness state already, - * but a lot of states will get revised from liveness point of view when - * the verifier explores other branches. - * Example: - * 1: *(u64)(r10 - 8) = 1 - * 2: if r1 == 100 goto pc+1 - * 3: *(u64)(r10 - 8) = 2 - * 4: r0 = *(u64)(r10 - 8) - * 5: exit - * when the verifier reaches exit insn the stack slot -8 in the state list of - * insn 2 is not yet marked alive. Then the verifier pops the other_branch - * of insn 2 and goes exploring further. After the insn 4 read, liveness - * analysis would propagate read mark for -8 at insn 2. 
- * - * Since the verifier pushes the branch states as it sees them while exploring - * the program the condition of walking the branch instruction for the second - * time means that all states below this branch were already explored and - * their final liveness marks are already propagated. - * Hence when the verifier completes the search of state list in is_state_visited() - * we can call this clean_live_states() function to clear dead the registers and stack - * slots to simplify state merging. - * - * Important note here that walking the same branch instruction in the callee - * doesn't meant that the states are DONE. The verifier has to compare - * the callsites - */ - /* Find id in idset and increment its count, or add new entry */ static void idset_cnt_inc(struct bpf_idset *idset, u32 id) { @@ -19649,8 +17281,8 @@ static u32 idset_cnt_get(struct bpf_idset *idset, u32 id) * A register with a non-zero id is called singular if no other register shares * the same base id. Such registers can be treated as independent (id=0). 
*/ -static void clear_singular_ids(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) +void bpf_clear_singular_ids(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) { struct bpf_idset *idset = &env->idset_scratch; struct bpf_func_state *func; @@ -19671,1064 +17303,11 @@ static void clear_singular_ids(struct bpf_verifier_env *env, continue; if (!reg->id) continue; - if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) { - reg->id = 0; - reg->off = 0; - } + if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) + clear_scalar_id(reg); })); } -static void clean_live_states(struct bpf_verifier_env *env, int insn, - struct bpf_verifier_state *cur) -{ - struct bpf_verifier_state_list *sl; - struct list_head *pos, *head; - - head = explored_state(env, insn); - list_for_each(pos, head) { - sl = container_of(pos, struct bpf_verifier_state_list, node); - if (sl->state.branches) - continue; - if (sl->state.insn_idx != insn || - !same_callsites(&sl->state, cur)) - continue; - if (sl->state.cleaned) - /* all regs in this state in all frames were already marked */ - continue; - if (incomplete_read_marks(env, &sl->state)) - continue; - clean_verifier_state(env, &sl->state); - } -} - -static bool regs_exact(const struct bpf_reg_state *rold, - const struct bpf_reg_state *rcur, - struct bpf_idmap *idmap) -{ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && - check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); -} - -enum exact_level { - NOT_EXACT, - EXACT, - RANGE_WITHIN -}; - -/* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, - struct bpf_reg_state *rcur, struct bpf_idmap *idmap, - enum exact_level exact) -{ - if (exact == EXACT) - return regs_exact(rold, rcur, idmap); - - if (rold->type == NOT_INIT) - /* explored state can't have used this */ - return true; - - /* Enforce that register types have to match 
exactly, including their - * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general - * rule. - * - * One can make a point that using a pointer register as unbounded - * SCALAR would be technically acceptable, but this could lead to - * pointer leaks because scalars are allowed to leak while pointers - * are not. We could make this safe in special cases if root is - * calling us, but it's probably not worth the hassle. - * - * Also, register types that are *not* MAYBE_NULL could technically be - * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE - * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point - * to the same map). - * However, if the old MAYBE_NULL register then got NULL checked, - * doing so could have affected others with the same id, and we can't - * check for that because we lost the id when we converted to - * a non-MAYBE_NULL variant. - * So, as a general rule we don't allow mixing MAYBE_NULL and - * non-MAYBE_NULL registers as well. - */ - if (rold->type != rcur->type) - return false; - - switch (base_type(rold->type)) { - case SCALAR_VALUE: - if (env->explore_alu_limits) { - /* explore_alu_limits disables tnum_in() and range_within() - * logic and requires everything to be strict - */ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && - check_scalar_ids(rold->id, rcur->id, idmap); - } - if (!rold->precise && exact == NOT_EXACT) - return true; - /* - * Linked register tracking uses rold->id to detect relationships. - * When rold->id == 0, the register is independent and any linking - * in rcur only adds constraints. When rold->id != 0, we must verify - * id mapping and (for BPF_ADD_CONST) offset consistency. 
- * - * +------------------+-----------+------------------+---------------+ - * | | rold->id | rold + ADD_CONST | rold->id == 0 | - * |------------------+-----------+------------------+---------------| - * | rcur->id | range,ids | false | range | - * | rcur + ADD_CONST | false | range,ids,off | range | - * | rcur->id == 0 | range,ids | false | range | - * +------------------+-----------+------------------+---------------+ - * - * Why check_ids() for scalar registers? - * - * Consider the following BPF code: - * 1: r6 = ... unbound scalar, ID=a ... - * 2: r7 = ... unbound scalar, ID=b ... - * 3: if (r6 > r7) goto +1 - * 4: r6 = r7 - * 5: if (r6 > X) goto ... - * 6: ... memory operation using r7 ... - * - * First verification path is [1-6]: - * - at (4) same bpf_reg_state::id (b) would be assigned to r6 and r7; - * - at (5) r6 would be marked <= X, sync_linked_regs() would also mark - * r7 <= X, because r6 and r7 share same id. - * Next verification path is [1-4, 6]. - * - * Instruction (6) would be reached in two states: - * I. r6{.id=b}, r7{.id=b} via path 1-6; - * II. r6{.id=a}, r7{.id=b} via path 1-4, 6. - * - * Use check_ids() to distinguish these states. - * --- - * Also verify that new value satisfies old value range knowledge. - */ - - /* ADD_CONST mismatch: different linking semantics */ - if ((rold->id & BPF_ADD_CONST) && !(rcur->id & BPF_ADD_CONST)) - return false; - - if (rold->id && !(rold->id & BPF_ADD_CONST) && (rcur->id & BPF_ADD_CONST)) - return false; - - /* Both have offset linkage: offsets must match */ - if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off) - return false; - - if (!check_scalar_ids(rold->id, rcur->id, idmap)) - return false; - - return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_MAP_KEY: - case PTR_TO_MAP_VALUE: - case PTR_TO_MEM: - case PTR_TO_BUF: - case PTR_TO_TP_BUFFER: - /* If the new min/max/var_off satisfy the old ones and - * everything else matches, we are OK. 
- */ - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && - range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off) && - check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); - case PTR_TO_PACKET_META: - case PTR_TO_PACKET: - /* We must have at least as much range as the old ptr - * did, so that any accesses which were safe before are - * still safe. This is true even if old range < old off, - * since someone could have accessed through (ptr - k), or - * even done ptr -= k in a register, to get a safe access. - */ - if (rold->range > rcur->range) - return false; - /* If the offsets don't match, we can't trust our alignment; - * nor can we be sure that we won't fall out of range. - */ - if (rold->off != rcur->off) - return false; - /* id relations must be preserved */ - if (!check_ids(rold->id, rcur->id, idmap)) - return false; - /* new val must satisfy old val knowledge */ - return range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_STACK: - /* two stack pointers are equal only if they're pointing to - * the same stack frame, since fp-8 in foo != fp-8 in bar - */ - return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno; - case PTR_TO_ARENA: - return true; - case PTR_TO_INSN: - return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && - rold->off == rcur->off && range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); - default: - return regs_exact(rold, rcur, idmap); - } -} - -static struct bpf_reg_state unbound_reg; - -static __init int unbound_reg_init(void) -{ - __mark_reg_unknown_imprecise(&unbound_reg); - return 0; -} -late_initcall(unbound_reg_init); - -static bool is_stack_all_misc(struct bpf_verifier_env *env, - struct bpf_stack_state *stack) -{ - u32 i; - - for (i = 0; i < ARRAY_SIZE(stack->slot_type); ++i) { - if ((stack->slot_type[i] == STACK_MISC) || - (stack->slot_type[i] == STACK_INVALID && 
env->allow_uninit_stack)) - continue; - return false; - } - - return true; -} - -static struct bpf_reg_state *scalar_reg_for_stack(struct bpf_verifier_env *env, - struct bpf_stack_state *stack) -{ - if (is_spilled_scalar_reg64(stack)) - return &stack->spilled_ptr; - - if (is_stack_all_misc(env, stack)) - return &unbound_reg; - - return NULL; -} - -static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, struct bpf_idmap *idmap, - enum exact_level exact) -{ - int i, spi; - - /* walk slots of the explored stack and ignore any additional - * slots in the current stack, since explored(safe) state - * didn't use them - */ - for (i = 0; i < old->allocated_stack; i++) { - struct bpf_reg_state *old_reg, *cur_reg; - - spi = i / BPF_REG_SIZE; - - if (exact == EXACT && - (i >= cur->allocated_stack || - old->stack[spi].slot_type[i % BPF_REG_SIZE] != - cur->stack[spi].slot_type[i % BPF_REG_SIZE])) - return false; - - if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) - continue; - - if (env->allow_uninit_stack && - old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC) - continue; - - /* explored stack has more populated slots than current stack - * and these slots were used - */ - if (i >= cur->allocated_stack) - return false; - - /* 64-bit scalar spill vs all slots MISC and vice versa. - * Load from all slots MISC produces unbound scalar. - * Construct a fake register for such stack and call - * regsafe() to ensure scalar ids are compared. - */ - old_reg = scalar_reg_for_stack(env, &old->stack[spi]); - cur_reg = scalar_reg_for_stack(env, &cur->stack[spi]); - if (old_reg && cur_reg) { - if (!regsafe(env, old_reg, cur_reg, idmap, exact)) - return false; - i += BPF_REG_SIZE - 1; - continue; - } - - /* if old state was safe with misc data in the stack - * it will be safe with zero-initialized stack. 
- * The opposite is not true - */ - if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && - cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) - continue; - if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != - cur->stack[spi].slot_type[i % BPF_REG_SIZE]) - /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has STACK_MISC -> - * this verifier states are not equivalent, - * return false to continue verification of this path - */ - return false; - if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1) - continue; - /* Both old and cur are having same slot_type */ - switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) { - case STACK_SPILL: - /* when explored and current stack slot are both storing - * spilled registers, check that stored pointers types - * are the same as well. - * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} - * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} - * such verifier states are not equivalent. 
- * return false to continue verification of this path - */ - if (!regsafe(env, &old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, idmap, exact)) - return false; - break; - case STACK_DYNPTR: - old_reg = &old->stack[spi].spilled_ptr; - cur_reg = &cur->stack[spi].spilled_ptr; - if (old_reg->dynptr.type != cur_reg->dynptr.type || - old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) - return false; - break; - case STACK_ITER: - old_reg = &old->stack[spi].spilled_ptr; - cur_reg = &cur->stack[spi].spilled_ptr; - /* iter.depth is not compared between states as it - * doesn't matter for correctness and would otherwise - * prevent convergence; we maintain it only to prevent - * infinite loop check triggering, see - * iter_active_depths_differ() - */ - if (old_reg->iter.btf != cur_reg->iter.btf || - old_reg->iter.btf_id != cur_reg->iter.btf_id || - old_reg->iter.state != cur_reg->iter.state || - /* ignore {old_reg,cur_reg}->iter.depth, see above */ - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) - return false; - break; - case STACK_IRQ_FLAG: - old_reg = &old->stack[spi].spilled_ptr; - cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || - old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) - return false; - break; - case STACK_MISC: - case STACK_ZERO: - case STACK_INVALID: - continue; - /* Ensure that new unhandled slot types return false by default */ - default: - return false; - } - } - return true; -} - -static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur, - struct bpf_idmap *idmap) -{ - int i; - - if (old->acquired_refs != cur->acquired_refs) - return false; - - if (old->active_locks != cur->active_locks) - return false; - - if (old->active_preempt_locks != cur->active_preempt_locks) - return false; - - if (old->active_rcu_locks != cur->active_rcu_locks) - return false; - - if 
(!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) - return false; - - if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || - old->active_lock_ptr != cur->active_lock_ptr) - return false; - - for (i = 0; i < old->acquired_refs; i++) { - if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || - old->refs[i].type != cur->refs[i].type) - return false; - switch (old->refs[i].type) { - case REF_TYPE_PTR: - case REF_TYPE_IRQ: - break; - case REF_TYPE_LOCK: - case REF_TYPE_RES_LOCK: - case REF_TYPE_RES_LOCK_IRQ: - if (old->refs[i].ptr != cur->refs[i].ptr) - return false; - break; - default: - WARN_ONCE(1, "Unhandled enum type for reference state: %d\n", old->refs[i].type); - return false; - } - } - - return true; -} - -/* compare two verifier states - * - * all states stored in state_list are known to be valid, since - * verifier reached 'bpf_exit' instruction through them - * - * this function is called when verifier exploring different branches of - * execution popped from the state stack. If it sees an old state that has - * more strict register state and more strict stack state then this execution - * branch doesn't need to be explored further, since verifier already - * concluded that more strict state leads to valid finish. - * - * Therefore two states are equivalent if register state is more conservative - * and explored stack state is more conservative than the current one. - * Example: - * explored current - * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) - * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) - * - * In other words if current stack state (one being explored) has more - * valid slots than old one that already passed validation, it means - * the verifier can stop exploring and conclude that current state is valid too - * - * Similarly with registers. 
If explored state has register type as invalid - * whereas register type in current state is meaningful, it means that - * the current state will reach 'bpf_exit' instruction safely - */ -static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact) -{ - u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before; - u16 i; - - if (old->callback_depth > cur->callback_depth) - return false; - - for (i = 0; i < MAX_BPF_REG; i++) - if (((1 << i) & live_regs) && - !regsafe(env, &old->regs[i], &cur->regs[i], - &env->idmap_scratch, exact)) - return false; - - if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) - return false; - - return true; -} - -static void reset_idmap_scratch(struct bpf_verifier_env *env) -{ - struct bpf_idmap *idmap = &env->idmap_scratch; - - idmap->tmp_id_gen = env->id_gen; - idmap->cnt = 0; -} - -static bool states_equal(struct bpf_verifier_env *env, - struct bpf_verifier_state *old, - struct bpf_verifier_state *cur, - enum exact_level exact) -{ - u32 insn_idx; - int i; - - if (old->curframe != cur->curframe) - return false; - - reset_idmap_scratch(env); - - /* Verification state from speculative execution simulation - * must never prune a non-speculative execution one. 
- */ - if (old->speculative && !cur->speculative) - return false; - - if (old->in_sleepable != cur->in_sleepable) - return false; - - if (!refsafe(old, cur, &env->idmap_scratch)) - return false; - - /* for states to be equal callsites have to be the same - * and all frame states need to be equivalent - */ - for (i = 0; i <= old->curframe; i++) { - insn_idx = frame_insn_idx(old, i); - if (old->frame[i]->callsite != cur->frame[i]->callsite) - return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) - return false; - } - return true; -} - -/* find precise scalars in the previous equivalent state and - * propagate them into the current state - */ -static int propagate_precision(struct bpf_verifier_env *env, - const struct bpf_verifier_state *old, - struct bpf_verifier_state *cur, - bool *changed) -{ - struct bpf_reg_state *state_reg; - struct bpf_func_state *state; - int i, err = 0, fr; - bool first; - - for (fr = old->curframe; fr >= 0; fr--) { - state = old->frame[fr]; - state_reg = state->regs; - first = true; - for (i = 0; i < BPF_REG_FP; i++, state_reg++) { - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) { - if (first) - verbose(env, "frame %d: propagating r%d", fr, i); - else - verbose(env, ",r%d", i); - } - bt_set_frame_reg(&env->bt, fr, i); - first = false; - } - - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (!is_spilled_reg(&state->stack[i])) - continue; - state_reg = &state->stack[i].spilled_ptr; - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) { - if (first) - verbose(env, "frame %d: propagating fp%d", - fr, (-i - 1) * BPF_REG_SIZE); - else - verbose(env, ",fp%d", (-i - 1) * BPF_REG_SIZE); - } - bt_set_frame_slot(&env->bt, fr, i); - first = false; - } - if (!first && (env->log.level & BPF_LOG_LEVEL2)) - verbose(env, "\n"); - } - - err = __mark_chain_precision(env, cur, 
-1, changed); - if (err < 0) - return err; - - return 0; -} - -#define MAX_BACKEDGE_ITERS 64 - -/* Propagate read and precision marks from visit->backedges[*].state->equal_state - * to corresponding parent states of visit->backedges[*].state until fixed point is reached, - * then free visit->backedges. - * After execution of this function incomplete_read_marks() will return false - * for all states corresponding to @visit->callchain. - */ -static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit) -{ - struct bpf_scc_backedge *backedge; - struct bpf_verifier_state *st; - bool changed; - int i, err; - - i = 0; - do { - if (i++ > MAX_BACKEDGE_ITERS) { - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "%s: too many iterations\n", __func__); - for (backedge = visit->backedges; backedge; backedge = backedge->next) - mark_all_scalars_precise(env, &backedge->state); - break; - } - changed = false; - for (backedge = visit->backedges; backedge; backedge = backedge->next) { - st = &backedge->state; - err = propagate_precision(env, st->equal_state, st, &changed); - if (err) - return err; - } - } while (changed); - - free_backedges(visit); - return 0; -} - -static bool states_maybe_looping(struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) -{ - struct bpf_func_state *fold, *fcur; - int i, fr = cur->curframe; - - if (old->curframe != fr) - return false; - - fold = old->frame[fr]; - fcur = cur->frame[fr]; - for (i = 0; i < MAX_BPF_REG; i++) - if (memcmp(&fold->regs[i], &fcur->regs[i], - offsetof(struct bpf_reg_state, frameno))) - return false; - return true; -} - -static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) -{ - return env->insn_aux_data[insn_idx].is_iter_next; -} - -/* is_state_visited() handles iter_next() (see process_iter_next_call() for - * terminology) calls specially: as opposed to bounded BPF loops, it *expects* - * states to match, which otherwise would look like an infinite loop. 
So while - * iter_next() calls are taken care of, we still need to be careful and - * prevent erroneous and too eager declaration of "infinite loop", when - * iterators are involved. - * - * Here's a situation in pseudo-BPF assembly form: - * - * 0: again: ; set up iter_next() call args - * 1: r1 = &it ; <CHECKPOINT HERE> - * 2: call bpf_iter_num_next ; this is iter_next() call - * 3: if r0 == 0 goto done - * 4: ... something useful here ... - * 5: goto again ; another iteration - * 6: done: - * 7: r1 = &it - * 8: call bpf_iter_num_destroy ; clean up iter state - * 9: exit - * - * This is a typical loop. Let's assume that we have a prune point at 1:, - * before we get to `call bpf_iter_num_next` (e.g., because of that `goto - * again`, assuming other heuristics don't get in a way). - * - * When we first time come to 1:, let's say we have some state X. We proceed - * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit. - * Now we come back to validate that forked ACTIVE state. We proceed through - * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we - * are converging. But the problem is that we don't know that yet, as this - * convergence has to happen at iter_next() call site only. So if nothing is - * done, at 1: verifier will use bounded loop logic and declare infinite - * looping (and would be *technically* correct, if not for iterator's - * "eventual sticky NULL" contract, see process_iter_next_call()). But we - * don't want that. So what we do in process_iter_next_call() when we go on - * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's - * a different iteration. So when we suspect an infinite loop, we additionally - * check if any of the *ACTIVE* iterator states depths differ. If yes, we - * pretend we are not looping and wait for next iter_next() call. - * - * This only applies to ACTIVE state. 
In DRAINED state we don't expect to - * loop, because that would actually mean infinite loop, as DRAINED state is - * "sticky", and so we'll keep returning into the same instruction with the - * same state (at least in one of possible code paths). - * - * This approach allows to keep infinite loop heuristic even in the face of - * active iterator. E.g., C snippet below is and will be detected as - * infinitely looping: - * - * struct bpf_iter_num it; - * int *p, x; - * - * bpf_iter_num_new(&it, 0, 10); - * while ((p = bpf_iter_num_next(&t))) { - * x = p; - * while (x--) {} // <<-- infinite loop here - * } - * - */ -static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) -{ - struct bpf_reg_state *slot, *cur_slot; - struct bpf_func_state *state; - int i, fr; - - for (fr = old->curframe; fr >= 0; fr--) { - state = old->frame[fr]; - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_ITER) - continue; - - slot = &state->stack[i].spilled_ptr; - if (slot->iter.state != BPF_ITER_STATE_ACTIVE) - continue; - - cur_slot = &cur->frame[fr]->stack[i].spilled_ptr; - if (cur_slot->iter.depth != slot->iter.depth) - return true; - } - } - return false; -} - -static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) -{ - struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state, *new; - bool force_new_state, add_new_state, loop; - int n, err, states_cnt = 0; - struct list_head *pos, *tmp, *head; - - force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || - /* Avoid accumulating infinitely long jmp history */ - cur->jmp_history_cnt > 40; - - /* bpf progs typically have pruning point every 4 instructions - * http://vger.kernel.org/bpfconf2019.html#session-1 - * Do not add new state for future pruning if the verifier hasn't seen - * at least 2 jumps and at least 8 instructions. 
- * This heuristics helps decrease 'total_states' and 'peak_states' metric. - * In tests that amounts to up to 50% reduction into total verifier - * memory consumption and 20% verifier time speedup. - */ - add_new_state = force_new_state; - if (env->jmps_processed - env->prev_jmps_processed >= 2 && - env->insn_processed - env->prev_insn_processed >= 8) - add_new_state = true; - - clean_live_states(env, insn_idx, cur); - - loop = false; - head = explored_state(env, insn_idx); - list_for_each_safe(pos, tmp, head) { - sl = container_of(pos, struct bpf_verifier_state_list, node); - states_cnt++; - if (sl->state.insn_idx != insn_idx) - continue; - - if (sl->state.branches) { - struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; - - if (frame->in_async_callback_fn && - frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) { - /* Different async_entry_cnt means that the verifier is - * processing another entry into async callback. - * Seeing the same state is not an indication of infinite - * loop or infinite recursion. - * But finding the same state doesn't mean that it's safe - * to stop processing the current state. The previous state - * hasn't yet reached bpf_exit, since state.branches > 0. - * Checking in_async_callback_fn alone is not enough either. - * Since the verifier still needs to catch infinite loops - * inside async callbacks. - */ - goto skip_inf_loop_check; - } - /* BPF open-coded iterators loop detection is special. - * states_maybe_looping() logic is too simplistic in detecting - * states that *might* be equivalent, because it doesn't know - * about ID remapping, so don't even perform it. - * See process_iter_next_call() and iter_active_depths_differ() - * for overview of the logic. When current and one of parent - * states are detected as equivalent, it's a good thing: we prove - * convergence and can stop simulating further iterations. 
- * It's safe to assume that iterator loop will finish, taking into - * account iter_next() contract of eventually returning - * sticky NULL result. - * - * Note, that states have to be compared exactly in this case because - * read and precision marks might not be finalized inside the loop. - * E.g. as in the program below: - * - * 1. r7 = -16 - * 2. r6 = bpf_get_prandom_u32() - * 3. while (bpf_iter_num_next(&fp[-8])) { - * 4. if (r6 != 42) { - * 5. r7 = -32 - * 6. r6 = bpf_get_prandom_u32() - * 7. continue - * 8. } - * 9. r0 = r10 - * 10. r0 += r7 - * 11. r8 = *(u64 *)(r0 + 0) - * 12. r6 = bpf_get_prandom_u32() - * 13. } - * - * Here verifier would first visit path 1-3, create a checkpoint at 3 - * with r7=-16, continue to 4-7,3. Existing checkpoint at 3 does - * not have read or precision mark for r7 yet, thus inexact states - * comparison would discard current state with r7=-32 - * => unsafe memory access at 11 would not be caught. - */ - if (is_iter_next_insn(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - struct bpf_func_state *cur_frame; - struct bpf_reg_state *iter_state, *iter_reg; - int spi; - - cur_frame = cur->frame[cur->curframe]; - /* btf_check_iter_kfuncs() enforces that - * iter state pointer is always the first arg - */ - iter_reg = &cur_frame->regs[BPF_REG_1]; - /* current state is valid due to states_equal(), - * so we can assume valid iter and reg state, - * no need for extra (re-)validations - */ - spi = __get_spi(iter_reg->off + iter_reg->var_off.value); - iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; - if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - loop = true; - goto hit; - } - } - goto skip_inf_loop_check; - } - if (is_may_goto_insn_at(env, insn_idx)) { - if (sl->state.may_goto_depth != cur->may_goto_depth && - states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - loop = true; - goto hit; - } - } - if (bpf_calls_callback(env, insn_idx)) { - if (states_equal(env, &sl->state, cur, 
RANGE_WITHIN)) { - loop = true; - goto hit; - } - goto skip_inf_loop_check; - } - /* attempt to detect infinite loop to avoid unnecessary doomed work */ - if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur, EXACT) && - !iter_active_depths_differ(&sl->state, cur) && - sl->state.may_goto_depth == cur->may_goto_depth && - sl->state.callback_unroll_depth == cur->callback_unroll_depth) { - verbose_linfo(env, insn_idx, "; "); - verbose(env, "infinite loop detected at insn %d\n", insn_idx); - verbose(env, "cur state:"); - print_verifier_state(env, cur, cur->curframe, true); - verbose(env, "old state:"); - print_verifier_state(env, &sl->state, cur->curframe, true); - return -EINVAL; - } - /* if the verifier is processing a loop, avoid adding new state - * too often, since different loop iterations have distinct - * states and may not help future pruning. - * This threshold shouldn't be too low to make sure that - * a loop with large bound will be rejected quickly. - * The most abusive loop will be: - * r1 += 1 - * if r1 < 1000000 goto pc-2 - * 1M insn_procssed limit / 100 == 10k peak states. - * This threshold shouldn't be too high either, since states - * at the end of the loop are likely to be useful in pruning. - */ -skip_inf_loop_check: - if (!force_new_state && - env->jmps_processed - env->prev_jmps_processed < 20 && - env->insn_processed - env->prev_insn_processed < 100) - add_new_state = false; - goto miss; - } - /* See comments for mark_all_regs_read_and_precise() */ - loop = incomplete_read_marks(env, &sl->state); - if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { -hit: - sl->hit_cnt++; - - /* if previous state reached the exit with precision and - * current state is equivalent to it (except precision marks) - * the precision needs to be propagated back in - * the current state. - */ - err = 0; - if (is_jmp_point(env, env->insn_idx)) - err = push_jmp_history(env, cur, 0, 0); - err = err ? 
: propagate_precision(env, &sl->state, cur, NULL); - if (err) - return err; - /* When processing iterator based loops above propagate_liveness and - * propagate_precision calls are not sufficient to transfer all relevant - * read and precision marks. E.g. consider the following case: - * - * .-> A --. Assume the states are visited in the order A, B, C. - * | | | Assume that state B reaches a state equivalent to state A. - * | v v At this point, state C is not processed yet, so state A - * '-- B C has not received any read or precision marks from C. - * Thus, marks propagated from A to B are incomplete. - * - * The verifier mitigates this by performing the following steps: - * - * - Prior to the main verification pass, strongly connected components - * (SCCs) are computed over the program's control flow graph, - * intraprocedurally. - * - * - During the main verification pass, `maybe_enter_scc()` checks - * whether the current verifier state is entering an SCC. If so, an - * instance of a `bpf_scc_visit` object is created, and the state - * entering the SCC is recorded as the entry state. - * - * - This instance is associated not with the SCC itself, but with a - * `bpf_scc_callchain`: a tuple consisting of the call sites leading to - * the SCC and the SCC id. See `compute_scc_callchain()`. - * - * - When a verification path encounters a `states_equal(..., - * RANGE_WITHIN)` condition, there exists a call chain describing the - * current state and a corresponding `bpf_scc_visit` instance. A copy - * of the current state is created and added to - * `bpf_scc_visit->backedges`. - * - * - When a verification path terminates, `maybe_exit_scc()` is called - * from `update_branch_counts()`. For states with `branches == 0`, it - * checks whether the state is the entry state of any `bpf_scc_visit` - * instance. If it is, this indicates that all paths originating from - * this SCC visit have been explored. 
`propagate_backedges()` is then - * called, which propagates read and precision marks through the - * backedges until a fixed point is reached. - * (In the earlier example, this would propagate marks from A to B, - * from C to A, and then again from A to B.) - * - * A note on callchains - * -------------------- - * - * Consider the following example: - * - * void foo() { loop { ... SCC#1 ... } } - * void main() { - * A: foo(); - * B: ... - * C: foo(); - * } - * - * Here, there are two distinct callchains leading to SCC#1: - * - (A, SCC#1) - * - (C, SCC#1) - * - * Each callchain identifies a separate `bpf_scc_visit` instance that - * accumulates backedge states. The `propagate_{liveness,precision}()` - * functions traverse the parent state of each backedge state, which - * means these parent states must remain valid (i.e., not freed) while - * the corresponding `bpf_scc_visit` instance exists. - * - * Associating `bpf_scc_visit` instances directly with SCCs instead of - * callchains would break this invariant: - * - States explored during `C: foo()` would contribute backedges to - * SCC#1, but SCC#1 would only be exited once the exploration of - * `A: foo()` completes. - * - By that time, the states explored between `A: foo()` and `C: foo()` - * (i.e., `B: ...`) may have already been freed, causing the parent - * links for states from `C: foo()` to become invalid. - */ - if (loop) { - struct bpf_scc_backedge *backedge; - - backedge = kzalloc_obj(*backedge, - GFP_KERNEL_ACCOUNT); - if (!backedge) - return -ENOMEM; - err = copy_verifier_state(&backedge->state, cur); - backedge->state.equal_state = &sl->state; - backedge->state.insn_idx = insn_idx; - err = err ?: add_scc_backedge(env, &sl->state, backedge); - if (err) { - free_verifier_state(&backedge->state, false); - kfree(backedge); - return err; - } - } - return 1; - } -miss: - /* when new state is not going to be added do not increase miss count. 
- * Otherwise several loop iterations will remove the state - * recorded earlier. The goal of these heuristics is to have - * states from some iterations of the loop (some in the beginning - * and some at the end) to help pruning. - */ - if (add_new_state) - sl->miss_cnt++; - /* heuristic to determine whether this state is beneficial - * to keep checking from state equivalence point of view. - * Higher numbers increase max_states_per_insn and verification time, - * but do not meaningfully decrease insn_processed. - * 'n' controls how many times state could miss before eviction. - * Use bigger 'n' for checkpoints because evicting checkpoint states - * too early would hinder iterator convergence. - */ - n = is_force_checkpoint(env, insn_idx) && sl->state.branches > 0 ? 64 : 3; - if (sl->miss_cnt > sl->hit_cnt * n + n) { - /* the state is unlikely to be useful. Remove it to - * speed up verification - */ - sl->in_free_list = true; - list_del(&sl->node); - list_add(&sl->node, &env->free_list); - env->free_list_size++; - env->explored_states_size--; - maybe_free_verifier_state(env, sl); - } - } - - if (env->max_states_per_insn < states_cnt) - env->max_states_per_insn = states_cnt; - - if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return 0; - - if (!add_new_state) - return 0; - - /* There were no equivalent states, remember the current one. - * Technically the current state is not proven to be safe yet, - * but it will either reach outer most bpf_exit (which means it's safe) - * or it will be rejected. When there are no loops the verifier won't be - * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) - * again on the way to bpf_exit. - * When looping the sl->state.branches will be > 0 and this state - * will not be considered for equivalence until branches == 0. 
- */ - new_sl = kzalloc_obj(struct bpf_verifier_state_list, GFP_KERNEL_ACCOUNT); - if (!new_sl) - return -ENOMEM; - env->total_states++; - env->explored_states_size++; - update_peak_states(env); - env->prev_jmps_processed = env->jmps_processed; - env->prev_insn_processed = env->insn_processed; - - /* forget precise markings we inherited, see __mark_chain_precision */ - if (env->bpf_capable) - mark_all_scalars_imprecise(env, cur); - - clear_singular_ids(env, cur); - - /* add new state to the head of linked list */ - new = &new_sl->state; - err = copy_verifier_state(new, cur); - if (err) { - free_verifier_state(new, false); - kfree(new_sl); - return err; - } - new->insn_idx = insn_idx; - verifier_bug_if(new->branches != 1, env, - "%s:branches_to_explore=%d insn %d", - __func__, new->branches, insn_idx); - err = maybe_enter_scc(env, new); - if (err) { - free_verifier_state(new, false); - kfree(new_sl); - return err; - } - - cur->parent = new; - cur->first_insn_idx = insn_idx; - cur->dfs_depth = new->dfs_depth + 1; - clear_jmp_history(cur); - list_add(&new_sl->node, head); - return 0; -} - /* Return true if it's OK to have the same insn return a different type. */ static bool reg_type_mismatch_ok(enum bpf_reg_type type) { @@ -20829,13 +17408,16 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ } enum { - PROCESS_BPF_EXIT = 1 + PROCESS_BPF_EXIT = 1, + INSN_IDX_UPDATED = 2, }; static int process_bpf_exit_full(struct bpf_verifier_env *env, bool *do_print_state, bool exception_exit) { + struct bpf_func_state *cur_frame = cur_func(env); + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback function, @@ -20843,7 +17425,8 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, * state when it exits. */ int err = check_resource_leak(env, exception_exit, - !env->cur_state->curframe, + exception_exit || !env->cur_state->curframe, + exception_exit ? 
"bpf_throw" : "BPF_EXIT instruction in main prog"); if (err) return err; @@ -20866,10 +17449,24 @@ static int process_bpf_exit_full(struct bpf_verifier_env *env, if (err) return err; *do_print_state = true; - return 0; + return INSN_IDX_UPDATED; } - err = check_return_code(env, BPF_REG_0, "R0"); + /* + * Return from a regular global subprogram differs from return + * from the main program or async/exception callback. + * Main program exit implies return code restrictions + * that depend on program type. + * Exit from exception callback is equivalent to main program exit. + * Exit from async callback implies return code restrictions + * that depend on async scheduling mechanism. + */ + if (cur_frame->subprogno && + !cur_frame->in_async_callback_fn && + !cur_frame->in_exception_callback_fn) + err = check_global_subprog_return_code(env); + else + err = check_return_code(env, BPF_REG_0, "R0"); if (err) return err; return PROCESS_BPF_EXIT; @@ -20881,19 +17478,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env, u32 *pmin_index, u32 *pmax_index) { struct bpf_reg_state *reg = reg_state(env, regno); - u64 min_index, max_index; + u64 min_index = reg->umin_value; + u64 max_index = reg->umax_value; const u32 size = 8; - if (check_add_overflow(reg->umin_value, reg->off, &min_index) || - (min_index > (u64) U32_MAX * size)) { - verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n", - regno, reg->umin_value, reg->off); + if (min_index > (u64) U32_MAX * size) { + verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value); return -ERANGE; } - if (check_add_overflow(reg->umax_value, reg->off, &max_index) || - (max_index > (u64) U32_MAX * size)) { - verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n", - regno, reg->umax_value, reg->off); + if (max_index > (u64) U32_MAX * size) { + verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value); return -ERANGE; } @@ -20943,13 +17537,13 
@@ static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *in /* Ensure that the buffer is large enough */ if (!env->gotox_tmp_buf || env->gotox_tmp_buf->cnt < max_index - min_index + 1) { - env->gotox_tmp_buf = iarray_realloc(env->gotox_tmp_buf, - max_index - min_index + 1); + env->gotox_tmp_buf = bpf_iarray_realloc(env->gotox_tmp_buf, + max_index - min_index + 1); if (!env->gotox_tmp_buf) return -ENOMEM; } - n = copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items); + n = bpf_copy_insn_array_uniq(map, min_index, max_index, env->gotox_tmp_buf->items); if (n < 0) return n; if (n == 0) { @@ -20959,13 +17553,15 @@ static int check_indirect_jump(struct bpf_verifier_env *env, struct bpf_insn *in } for (i = 0; i < n - 1; i++) { + mark_indirect_target(env, env->gotox_tmp_buf->items[i]); other_branch = push_stack(env, env->gotox_tmp_buf->items[i], env->insn_idx, env->cur_state->speculative); if (IS_ERR(other_branch)) return PTR_ERR(other_branch); } env->insn_idx = env->gotox_tmp_buf->items[n-1]; - return 0; + mark_indirect_target(env, env->insn_idx); + return INSN_IDX_UPDATED; } static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) @@ -20974,81 +17570,48 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx]; u8 class = BPF_CLASS(insn->code); - if (class == BPF_ALU || class == BPF_ALU64) { - err = check_alu_op(env, insn); - if (err) - return err; - - } else if (class == BPF_LDX) { - bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; + switch (class) { + case BPF_ALU: + case BPF_ALU64: + return check_alu_op(env, insn); - /* Check for reserved fields is already done in - * resolve_pseudo_ldimm64(). 
- */ - err = check_load_mem(env, insn, false, is_ldsx, true, "ldx"); - if (err) - return err; - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, insn); - if (err) - return err; - env->insn_idx++; - return 0; - } + case BPF_LDX: + return check_load_mem(env, insn, false, + BPF_MODE(insn->code) == BPF_MEMSX, + true, "ldx"); - if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } + case BPF_STX: + if (BPF_MODE(insn->code) == BPF_ATOMIC) + return check_atomic(env, insn); + return check_store_reg(env, insn, false); - err = check_store_reg(env, insn, false); - if (err) - return err; - } else if (class == BPF_ST) { + case BPF_ST: { enum bpf_reg_type dst_reg_type; - if (BPF_MODE(insn->code) != BPF_MEM || - insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_ST uses reserved fields\n"); - return -EINVAL; - } - /* check src operand */ err = check_reg_arg(env, insn->dst_reg, SRC_OP); if (err) return err; dst_reg_type = cur_regs(env)[insn->dst_reg].type; - /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, false, false); if (err) return err; - err = save_aux_ptr_type(env, dst_reg_type, false); - if (err) - return err; - } else if (class == BPF_JMP || class == BPF_JMP32) { + return save_aux_ptr_type(env, dst_reg_type, false); + } + case BPF_JMP: + case BPF_JMP32: { u8 opcode = BPF_OP(insn->code); env->jmps_processed++; if (opcode == BPF_CALL) { - if (BPF_SRC(insn->code) != BPF_K || - (insn->src_reg != BPF_PSEUDO_KFUNC_CALL && - insn->off != 0) || - (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL && - insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || - insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { - verbose(env, "BPF_CALL uses reserved fields\n"); - return -EINVAL; - } - if (env->cur_state->active_locks) { if ((insn->src_reg == 
BPF_REG_0 && - insn->imm != BPF_FUNC_spin_unlock) || + insn->imm != BPF_FUNC_spin_unlock && + insn->imm != BPF_FUNC_kptr_xchg) || (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { verbose(env, @@ -21056,84 +17619,45 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) return -EINVAL; } } - if (insn->src_reg == BPF_PSEUDO_CALL) { - err = check_func_call(env, insn, &env->insn_idx); - } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - err = check_kfunc_call(env, insn, &env->insn_idx); - if (!err && is_bpf_throw_kfunc(insn)) - return process_bpf_exit_full(env, do_print_state, true); - } else { - err = check_helper_call(env, insn, &env->insn_idx); - } - if (err) - return err; - mark_reg_scratched(env, BPF_REG_0); + if (insn->src_reg == BPF_PSEUDO_CALL) + return check_func_call(env, insn, &env->insn_idx); + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + return check_kfunc_call(env, insn, &env->insn_idx); + return check_helper_call(env, insn, &env->insn_idx); } else if (opcode == BPF_JA) { - if (BPF_SRC(insn->code) == BPF_X) { - if (insn->src_reg != BPF_REG_0 || - insn->imm != 0 || insn->off != 0) { - verbose(env, "BPF_JA|BPF_X uses reserved fields\n"); - return -EINVAL; - } + if (BPF_SRC(insn->code) == BPF_X) return check_indirect_jump(env, insn); - } - - if (BPF_SRC(insn->code) != BPF_K || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - (class == BPF_JMP && insn->imm != 0) || - (class == BPF_JMP32 && insn->off != 0)) { - verbose(env, "BPF_JA uses reserved fields\n"); - return -EINVAL; - } if (class == BPF_JMP) env->insn_idx += insn->off + 1; else env->insn_idx += insn->imm + 1; - return 0; + return INSN_IDX_UPDATED; } else if (opcode == BPF_EXIT) { - if (BPF_SRC(insn->code) != BPF_K || - insn->imm != 0 || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { - verbose(env, "BPF_EXIT uses reserved fields\n"); - return -EINVAL; - } return 
process_bpf_exit_full(env, do_print_state, false); - } else { - err = check_cond_jmp_op(env, insn, &env->insn_idx); - if (err) - return err; } - } else if (class == BPF_LD) { + return check_cond_jmp_op(env, insn, &env->insn_idx); + } + case BPF_LD: { u8 mode = BPF_MODE(insn->code); - if (mode == BPF_ABS || mode == BPF_IND) { - err = check_ld_abs(env, insn); - if (err) - return err; + if (mode == BPF_ABS || mode == BPF_IND) + return check_ld_abs(env, insn); - } else if (mode == BPF_IMM) { + if (mode == BPF_IMM) { err = check_ld_imm(env, insn); if (err) return err; env->insn_idx++; sanitize_mark_insn_seen(env); - } else { - verbose(env, "invalid BPF_LD mode\n"); - return -EINVAL; } - } else { - verbose(env, "unknown insn class %d\n", class); - return -EINVAL; + return 0; } - - env->insn_idx++; - return 0; + } + /* all class values are handled above. silence compiler warning */ + return -EFAULT; } static int do_check(struct bpf_verifier_env *env) @@ -21148,7 +17672,7 @@ static int do_check(struct bpf_verifier_env *env) for (;;) { struct bpf_insn *insn; struct bpf_insn_aux_data *insn_aux; - int err, marks_err; + int err; /* reset current history entry on each new instruction */ env->cur_hist_ent = NULL; @@ -21173,8 +17697,8 @@ static int do_check(struct bpf_verifier_env *env) state->last_insn_idx = env->prev_insn_idx; state->insn_idx = env->insn_idx; - if (is_prune_point(env, env->insn_idx)) { - err = is_state_visited(env, env->insn_idx); + if (bpf_is_prune_point(env, env->insn_idx)) { + err = bpf_is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { @@ -21192,8 +17716,8 @@ static int do_check(struct bpf_verifier_env *env) } } - if (is_jmp_point(env, env->insn_idx)) { - err = push_jmp_history(env, state, 0, 0); + if (bpf_is_jmp_point(env, env->insn_idx)) { + err = bpf_push_jmp_history(env, state, 0, 0); if (err) return err; } @@ -21220,7 +17744,7 @@ static int do_check(struct bpf_verifier_env *env) verbose_linfo(env, env->insn_idx, "; "); 
env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); - verbose_insn(env, insn); + bpf_verbose_insn(env, insn); env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; env->prev_log_pos = env->log.end_pos; } @@ -21235,21 +17759,34 @@ static int do_check(struct bpf_verifier_env *env) sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; + /* Sanity check: precomputed constants must match verifier state */ + if (!state->speculative && insn_aux->const_reg_mask) { + struct bpf_reg_state *regs = cur_regs(env); + u16 mask = insn_aux->const_reg_mask; + + for (int r = 0; r < ARRAY_SIZE(insn_aux->const_reg_vals); r++) { + u32 cval = insn_aux->const_reg_vals[r]; + + if (!(mask & BIT(r))) + continue; + if (regs[r].type != SCALAR_VALUE) + continue; + if (!tnum_is_const(regs[r].var_off)) + continue; + if (verifier_bug_if((u32)regs[r].var_off.value != cval, + env, "const R%d: %u != %llu", + r, cval, regs[r].var_off.value)) + return -EFAULT; + } + } + /* Reduce verification complexity by stopping speculative path * verification when a nospec is encountered. */ if (state->speculative && insn_aux->nospec) goto process_bpf_exit; - err = bpf_reset_stack_write_marks(env, env->insn_idx); - if (err) - return err; err = do_check_insn(env, &do_print_state); - if (err >= 0 || error_recoverable_with_nospec(err)) { - marks_err = bpf_commit_stack_write_marks(env); - if (marks_err) - return marks_err; - } if (error_recoverable_with_nospec(err) && state->speculative) { /* Prevent this speculative path from ever reaching the * insn that would have been unsafe to execute. 
@@ -21264,8 +17801,10 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (err == PROCESS_BPF_EXIT) { goto process_bpf_exit; + } else if (err == INSN_IDX_UPDATED) { + } else if (err == 0) { + env->insn_idx++; } - WARN_ON_ONCE(err); if (state->speculative && insn_aux->nospec_result) { /* If we are on a path that performed a jump-op, this @@ -21289,10 +17828,7 @@ static int do_check(struct bpf_verifier_env *env) return -EFAULT; process_bpf_exit: mark_verifier_state_scratched(env); - err = update_branch_counts(env, env->cur_state); - if (err) - return err; - err = bpf_update_live_stack(env); + err = bpf_update_branch_counts(env, env->cur_state); if (err) return err; err = pop_stack(env, &prev_insn_idx, &env->insn_idx, @@ -21685,14 +18221,199 @@ static int add_used_map(struct bpf_verifier_env *env, int fd) return __add_used_map(env, map); } -/* find and rewrite pseudo imm in ld_imm64 instructions: +static int check_alu_fields(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + switch (opcode) { + case BPF_NEG: + if (BPF_SRC(insn->code) != BPF_K || insn->src_reg != BPF_REG_0 || + insn->off != 0 || insn->imm != 0) { + verbose(env, "BPF_NEG uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_END: + if (insn->src_reg != BPF_REG_0 || insn->off != 0 || + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || + (class == BPF_ALU64 && BPF_SRC(insn->code) != BPF_TO_LE)) { + verbose(env, "BPF_END uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_MOV: + if (BPF_SRC(insn->code) == BPF_X) { + if (class == BPF_ALU) { + if ((insn->off != 0 && insn->off != 8 && insn->off != 16) || + insn->imm) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } else if (insn->off == BPF_ADDR_SPACE_CAST) { + if (insn->imm != 1 && insn->imm != 1u << 16) { + verbose(env, "addr_space_cast insn can only convert between address space 1 and 
0\n"); + return -EINVAL; + } + } else if ((insn->off != 0 && insn->off != 8 && + insn->off != 16 && insn->off != 32) || insn->imm) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } else if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose(env, "BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_OR: + case BPF_XOR: + case BPF_LSH: + case BPF_RSH: + case BPF_ARSH: + case BPF_MUL: + case BPF_DIV: + case BPF_MOD: + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || (insn->off != 0 && insn->off != 1) || + (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { + verbose(env, "BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + } else if (insn->src_reg != BPF_REG_0 || + (insn->off != 0 && insn->off != 1) || + (insn->off == 1 && opcode != BPF_MOD && opcode != BPF_DIV)) { + verbose(env, "BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + return 0; + default: + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); + return -EINVAL; + } +} + +static int check_jmp_fields(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + switch (opcode) { + case BPF_CALL: + if (BPF_SRC(insn->code) != BPF_K || + (insn->src_reg != BPF_PSEUDO_KFUNC_CALL && insn->off != 0) || + (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL && + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || + insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { + verbose(env, "BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_JA: + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->src_reg != BPF_REG_0 || insn->imm != 0 || insn->off != 0) { + verbose(env, "BPF_JA|BPF_X uses reserved fields\n"); + return -EINVAL; + } + } else if (insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || + (class == BPF_JMP && insn->imm != 0) || + (class == BPF_JMP32 && insn->off != 0)) { + 
verbose(env, "BPF_JA uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_EXIT: + if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || + insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { + verbose(env, "BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_JCOND: + if (insn->code != (BPF_JMP | BPF_JCOND) || insn->src_reg != BPF_MAY_GOTO || + insn->dst_reg || insn->imm) { + verbose(env, "invalid may_goto imm %d\n", insn->imm); + return -EINVAL; + } + return 0; + default: + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0) { + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); + return -EINVAL; + } + } else if (insn->src_reg != BPF_REG_0) { + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); + return -EINVAL; + } + return 0; + } +} + +static int check_insn_fields(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + switch (BPF_CLASS(insn->code)) { + case BPF_ALU: + case BPF_ALU64: + return check_alu_fields(env, insn); + case BPF_LDX: + if ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) || + insn->imm != 0) { + verbose(env, "BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_STX: + if (BPF_MODE(insn->code) == BPF_ATOMIC) + return 0; + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_ST: + if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { + verbose(env, "BPF_ST uses reserved fields\n"); + return -EINVAL; + } + return 0; + case BPF_JMP: + case BPF_JMP32: + return check_jmp_fields(env, insn); + case BPF_LD: { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + BPF_SIZE(insn->code) == BPF_DW || + (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); + return -EINVAL; 
+ } + } else if (mode != BPF_IMM) { + verbose(env, "invalid BPF_LD mode\n"); + return -EINVAL; + } + return 0; + } + default: + verbose(env, "unknown insn class %d\n", BPF_CLASS(insn->code)); + return -EINVAL; + } +} + +/* + * Check that insns are sane and rewrite pseudo imm in ld_imm64 instructions: * * 1. if it accesses map FD, replace it with actual map pointer. * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. * * NOTE: btf_vmlinux is required for converting pseudo btf_id. */ -static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) +static int check_and_resolve_insns(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -21703,13 +18424,14 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return err; for (i = 0; i < insn_cnt; i++, insn++) { - if (BPF_CLASS(insn->code) == BPF_LDX && - ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_MEMSX) || - insn->imm != 0)) { - verbose(env, "BPF_LDX uses reserved fields\n"); + if (insn->dst_reg >= MAX_BPF_REG) { + verbose(env, "R%d is invalid\n", insn->dst_reg); + return -EINVAL; + } + if (insn->src_reg >= MAX_BPF_REG) { + verbose(env, "R%d is invalid\n", insn->src_reg); return -EINVAL; } - if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; @@ -21724,6 +18446,11 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -EINVAL; } + if (insn[0].off != 0) { + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; @@ -21820,6 +18547,10 @@ next_insn: verbose(env, "unknown opcode %02x\n", insn->code); return -EINVAL; } + + err = check_insn_fields(env, insn); + if (err) + return err; } /* now all pseudo BPF_LD_IMM64 instructions load valid @@ -21858,53 +18589,6 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) } } -/* single 
env->prog->insni[off] instruction was replaced with the range - * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying - * [0, off) and [off, end) to new locations, so the patched range stays zero - */ -static void adjust_insn_aux_data(struct bpf_verifier_env *env, - struct bpf_prog *new_prog, u32 off, u32 cnt) -{ - struct bpf_insn_aux_data *data = env->insn_aux_data; - struct bpf_insn *insn = new_prog->insnsi; - u32 old_seen = data[off].seen; - u32 prog_len; - int i; - - /* aux info at OFF always needs adjustment, no matter fast path - * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the - * original insn at old prog. - */ - data[off].zext_dst = insn_has_def32(insn + off + cnt - 1); - - if (cnt == 1) - return; - prog_len = new_prog->len; - - memmove(data + off + cnt - 1, data + off, - sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); - memset(data + off, 0, sizeof(struct bpf_insn_aux_data) * (cnt - 1)); - for (i = off; i < off + cnt - 1; i++) { - /* Expand insni[off]'s seen count to the patched range. */ - data[i].seen = old_seen; - data[i].zext_dst = insn_has_def32(insn + i); - } -} - -static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) -{ - int i; - - if (len == 1) - return; - /* NOTE: fake 'exit' subprog should be updated as well. 
*/ - for (i = 0; i <= env->subprog_cnt; i++) { - if (env->subprog_info[i].start <= off) - continue; - env->subprog_info[i].start += len - 1; - } -} - static void release_insn_arrays(struct bpf_verifier_env *env) { int i; @@ -21913,281 +18597,7 @@ static void release_insn_arrays(struct bpf_verifier_env *env) bpf_insn_array_release(env->insn_array_maps[i]); } -static void adjust_insn_arrays(struct bpf_verifier_env *env, u32 off, u32 len) -{ - int i; - - if (len == 1) - return; - - for (i = 0; i < env->insn_array_map_cnt; i++) - bpf_insn_array_adjust(env->insn_array_maps[i], off, len); -} - -static void adjust_insn_arrays_after_remove(struct bpf_verifier_env *env, u32 off, u32 len) -{ - int i; - - for (i = 0; i < env->insn_array_map_cnt; i++) - bpf_insn_array_adjust_after_remove(env->insn_array_maps[i], off, len); -} - -static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) -{ - struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; - int i, sz = prog->aux->size_poke_tab; - struct bpf_jit_poke_descriptor *desc; - - for (i = 0; i < sz; i++) { - desc = &tab[i]; - if (desc->insn_idx <= off) - continue; - desc->insn_idx += len - 1; - } -} - -static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, - const struct bpf_insn *patch, u32 len) -{ - struct bpf_prog *new_prog; - struct bpf_insn_aux_data *new_data = NULL; - - if (len > 1) { - new_data = vrealloc(env->insn_aux_data, - array_size(env->prog->len + len - 1, - sizeof(struct bpf_insn_aux_data)), - GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!new_data) - return NULL; - - env->insn_aux_data = new_data; - } - new_prog = bpf_patch_insn_single(env->prog, off, patch, len); - if (IS_ERR(new_prog)) { - if (PTR_ERR(new_prog) == -ERANGE) - verbose(env, - "insn %d cannot be patched due to 16-bit range\n", - env->insn_aux_data[off].orig_idx); - return NULL; - } - adjust_insn_aux_data(env, new_prog, off, len); - adjust_subprog_starts(env, off, len); - adjust_insn_arrays(env, off, len); - 
adjust_poke_descs(new_prog, off, len); - return new_prog; -} - -/* - * For all jmp insns in a given 'prog' that point to 'tgt_idx' insn adjust the - * jump offset by 'delta'. - */ -static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta) -{ - struct bpf_insn *insn = prog->insnsi; - u32 insn_cnt = prog->len, i; - s32 imm; - s16 off; - - for (i = 0; i < insn_cnt; i++, insn++) { - u8 code = insn->code; - - if (tgt_idx <= i && i < tgt_idx + delta) - continue; - - if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || - BPF_OP(code) == BPF_CALL || BPF_OP(code) == BPF_EXIT) - continue; - - if (insn->code == (BPF_JMP32 | BPF_JA)) { - if (i + 1 + insn->imm != tgt_idx) - continue; - if (check_add_overflow(insn->imm, delta, &imm)) - return -ERANGE; - insn->imm = imm; - } else { - if (i + 1 + insn->off != tgt_idx) - continue; - if (check_add_overflow(insn->off, delta, &off)) - return -ERANGE; - insn->off = off; - } - } - return 0; -} - -static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, - u32 off, u32 cnt) -{ - int i, j; - - /* find first prog starting at or after off (first to remove) */ - for (i = 0; i < env->subprog_cnt; i++) - if (env->subprog_info[i].start >= off) - break; - /* find first prog starting at or after off + cnt (first to stay) */ - for (j = i; j < env->subprog_cnt; j++) - if (env->subprog_info[j].start >= off + cnt) - break; - /* if j doesn't start exactly at off + cnt, we are just removing - * the front of previous prog - */ - if (env->subprog_info[j].start != off + cnt) - j--; - - if (j > i) { - struct bpf_prog_aux *aux = env->prog->aux; - int move; - - /* move fake 'exit' subprog as well */ - move = env->subprog_cnt + 1 - j; - - memmove(env->subprog_info + i, - env->subprog_info + j, - sizeof(*env->subprog_info) * move); - env->subprog_cnt -= j - i; - - /* remove func_info */ - if (aux->func_info) { - move = aux->func_info_cnt - j; - - memmove(aux->func_info + i, - aux->func_info + j, - 
sizeof(*aux->func_info) * move); - aux->func_info_cnt -= j - i; - /* func_info->insn_off is set after all code rewrites, - * in adjust_btf_func() - no need to adjust - */ - } - } else { - /* convert i from "first prog to remove" to "first to adjust" */ - if (env->subprog_info[i].start == off) - i++; - } - - /* update fake 'exit' subprog as well */ - for (; i <= env->subprog_cnt; i++) - env->subprog_info[i].start -= cnt; - - return 0; -} - -static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, - u32 cnt) -{ - struct bpf_prog *prog = env->prog; - u32 i, l_off, l_cnt, nr_linfo; - struct bpf_line_info *linfo; - - nr_linfo = prog->aux->nr_linfo; - if (!nr_linfo) - return 0; - - linfo = prog->aux->linfo; - - /* find first line info to remove, count lines to be removed */ - for (i = 0; i < nr_linfo; i++) - if (linfo[i].insn_off >= off) - break; - - l_off = i; - l_cnt = 0; - for (; i < nr_linfo; i++) - if (linfo[i].insn_off < off + cnt) - l_cnt++; - else - break; - - /* First live insn doesn't match first live linfo, it needs to "inherit" - * last removed linfo. prog is already modified, so prog->len == off - * means no live instructions after (tail of the program was removed). - */ - if (prog->len != off && l_cnt && - (i == nr_linfo || linfo[i].insn_off != off + cnt)) { - l_cnt--; - linfo[--i].insn_off = off + cnt; - } - - /* remove the line info which refer to the removed instructions */ - if (l_cnt) { - memmove(linfo + l_off, linfo + i, - sizeof(*linfo) * (nr_linfo - i)); - - prog->aux->nr_linfo -= l_cnt; - nr_linfo = prog->aux->nr_linfo; - } - - /* pull all linfo[i].insn_off >= off + cnt in by cnt */ - for (i = l_off; i < nr_linfo; i++) - linfo[i].insn_off -= cnt; - - /* fix up all subprogs (incl. 
'exit') which start >= off */ - for (i = 0; i <= env->subprog_cnt; i++) - if (env->subprog_info[i].linfo_idx > l_off) { - /* program may have started in the removed region but - * may not be fully removed - */ - if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) - env->subprog_info[i].linfo_idx -= l_cnt; - else - env->subprog_info[i].linfo_idx = l_off; - } - - return 0; -} - -/* - * Clean up dynamically allocated fields of aux data for instructions [start, ...] - */ -static void clear_insn_aux_data(struct bpf_verifier_env *env, int start, int len) -{ - struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn *insns = env->prog->insnsi; - int end = start + len; - int i; - - for (i = start; i < end; i++) { - if (aux_data[i].jt) { - kvfree(aux_data[i].jt); - aux_data[i].jt = NULL; - } - - if (bpf_is_ldimm64(&insns[i])) - i++; - } -} - -static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) -{ - struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - unsigned int orig_prog_len = env->prog->len; - int err; - - if (bpf_prog_is_offloaded(env->prog->aux)) - bpf_prog_offload_remove_insns(env, off, cnt); - - /* Should be called before bpf_remove_insns, as it uses prog->insnsi */ - clear_insn_aux_data(env, off, cnt); - - err = bpf_remove_insns(env->prog, off, cnt); - if (err) - return err; - - err = adjust_subprog_starts_after_remove(env, off, cnt); - if (err) - return err; - - err = bpf_adj_linfo_after_remove(env, off, cnt); - if (err) - return err; - - adjust_insn_arrays_after_remove(env, off, cnt); - - memmove(aux_data + off, aux_data + off + cnt, - sizeof(*aux_data) * (orig_prog_len - off - cnt)); - - return 0; -} /* The verifier does more data flow analysis than llvm and will not * explore branches that are dead at run time. 
Malicious programs can @@ -22216,2210 +18626,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) } } -static bool insn_is_cond_jump(u8 code) -{ - u8 op; - - op = BPF_OP(code); - if (BPF_CLASS(code) == BPF_JMP32) - return op != BPF_JA; - - if (BPF_CLASS(code) != BPF_JMP) - return false; - - return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; -} - -static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) -{ - struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); - struct bpf_insn *insn = env->prog->insnsi; - const int insn_cnt = env->prog->len; - int i; - - for (i = 0; i < insn_cnt; i++, insn++) { - if (!insn_is_cond_jump(insn->code)) - continue; - - if (!aux_data[i + 1].seen) - ja.off = insn->off; - else if (!aux_data[i + 1 + insn->off].seen) - ja.off = 0; - else - continue; - - if (bpf_prog_is_offloaded(env->prog->aux)) - bpf_prog_offload_replace_insn(env, i, &ja); - - memcpy(insn, &ja, sizeof(ja)); - } -} - -static int opt_remove_dead_code(struct bpf_verifier_env *env) -{ - struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - int insn_cnt = env->prog->len; - int i, err; - - for (i = 0; i < insn_cnt; i++) { - int j; - - j = 0; - while (i + j < insn_cnt && !aux_data[i + j].seen) - j++; - if (!j) - continue; - - err = verifier_remove_insns(env, i, j); - if (err) - return err; - insn_cnt = env->prog->len; - } - - return 0; -} - -static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0); -static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0); - -static int opt_remove_nops(struct bpf_verifier_env *env) -{ - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - bool is_may_goto_0, is_ja; - int i, err; - - for (i = 0; i < insn_cnt; i++) { - is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0)); - is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP)); - - if (!is_may_goto_0 && !is_ja) - continue; - - err = 
verifier_remove_insns(env, i, 1); - if (err) - return err; - insn_cnt--; - /* Go back one insn to catch may_goto +1; may_goto +0 sequence */ - i -= (is_may_goto_0 && i > 0) ? 2 : 1; - } - - return 0; -} - -static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, - const union bpf_attr *attr) -{ - struct bpf_insn *patch; - /* use env->insn_buf as two independent buffers */ - struct bpf_insn *zext_patch = env->insn_buf; - struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2]; - struct bpf_insn_aux_data *aux = env->insn_aux_data; - int i, patch_len, delta = 0, len = env->prog->len; - struct bpf_insn *insns = env->prog->insnsi; - struct bpf_prog *new_prog; - bool rnd_hi32; - - rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; - zext_patch[1] = BPF_ZEXT_REG(0); - rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); - rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); - rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); - for (i = 0; i < len; i++) { - int adj_idx = i + delta; - struct bpf_insn insn; - int load_reg; - - insn = insns[adj_idx]; - load_reg = insn_def_regno(&insn); - if (!aux[adj_idx].zext_dst) { - u8 code, class; - u32 imm_rnd; - - if (!rnd_hi32) - continue; - - code = insn.code; - class = BPF_CLASS(code); - if (load_reg == -1) - continue; - - /* NOTE: arg "reg" (the fourth one) is only used for - * BPF_STX + SRC_OP, so it is safe to pass NULL - * here. - */ - if (is_reg64(&insn, load_reg, NULL, DST_OP)) { - if (class == BPF_LD && - BPF_MODE(code) == BPF_IMM) - i++; - continue; - } - - /* ctx load could be transformed into wider load. */ - if (class == BPF_LDX && - aux[adj_idx].ptr_type == PTR_TO_CTX) - continue; - - imm_rnd = get_random_u32(); - rnd_hi32_patch[0] = insn; - rnd_hi32_patch[1].imm = imm_rnd; - rnd_hi32_patch[3].dst_reg = load_reg; - patch = rnd_hi32_patch; - patch_len = 4; - goto apply_patch_buffer; - } - - /* Add in an zero-extend instruction if a) the JIT has requested - * it or b) it's a CMPXCHG. 
- * - * The latter is because: BPF_CMPXCHG always loads a value into - * R0, therefore always zero-extends. However some archs' - * equivalent instruction only does this load when the - * comparison is successful. This detail of CMPXCHG is - * orthogonal to the general zero-extension behaviour of the - * CPU, so it's treated independently of bpf_jit_needs_zext. - */ - if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn)) - continue; - - /* Zero-extension is done by the caller. */ - if (bpf_pseudo_kfunc_call(&insn)) - continue; - - if (verifier_bug_if(load_reg == -1, env, - "zext_dst is set, but no reg is defined")) - return -EFAULT; - - zext_patch[0] = insn; - zext_patch[1].dst_reg = load_reg; - zext_patch[1].src_reg = load_reg; - patch = zext_patch; - patch_len = 2; -apply_patch_buffer: - new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); - if (!new_prog) - return -ENOMEM; - env->prog = new_prog; - insns = new_prog->insnsi; - aux = env->insn_aux_data; - delta += patch_len - 1; - } - - return 0; -} - -/* convert load instructions that access fields of a context type into a - * sequence of instructions that access fields of the underlying structure: - * struct __sk_buff -> struct sk_buff - * struct bpf_sock_ops -> struct sock - */ -static int convert_ctx_accesses(struct bpf_verifier_env *env) -{ - struct bpf_subprog_info *subprogs = env->subprog_info; - const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0; - const int insn_cnt = env->prog->len; - struct bpf_insn *epilogue_buf = env->epilogue_buf; - struct bpf_insn *insn_buf = env->insn_buf; - struct bpf_insn *insn; - u32 target_size, size_default, off; - struct bpf_prog *new_prog; - enum bpf_access_type type; - bool is_narrower_load; - int epilogue_idx = 0; - - if (ops->gen_epilogue) { - epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, - -(subprogs[0].stack_depth + 8)); - if (epilogue_cnt >= INSN_BUF_SIZE) { - verifier_bug(env, 
"epilogue is too long"); - return -EFAULT; - } else if (epilogue_cnt) { - /* Save the ARG_PTR_TO_CTX for the epilogue to use */ - cnt = 0; - subprogs[0].stack_depth += 8; - insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, - -subprogs[0].stack_depth); - insn_buf[cnt++] = env->prog->insnsi[0]; - new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - env->prog = new_prog; - delta += cnt - 1; - - ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1); - if (ret < 0) - return ret; - } - } - - if (ops->gen_prologue || env->seen_direct_write) { - if (!ops->gen_prologue) { - verifier_bug(env, "gen_prologue is null"); - return -EFAULT; - } - cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, - env->prog); - if (cnt >= INSN_BUF_SIZE) { - verifier_bug(env, "prologue is too long"); - return -EFAULT; - } else if (cnt) { - new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - env->prog = new_prog; - delta += cnt - 1; - - ret = add_kfunc_in_insns(env, insn_buf, cnt - 1); - if (ret < 0) - return ret; - } - } - - if (delta) - WARN_ON(adjust_jmp_off(env->prog, 0, delta)); - - if (bpf_prog_is_offloaded(env->prog->aux)) - return 0; - - insn = env->prog->insnsi + delta; - - for (i = 0; i < insn_cnt; i++, insn++) { - bpf_convert_ctx_access_t convert_ctx_access; - u8 mode; - - if (env->insn_aux_data[i + delta].nospec) { - WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state); - struct bpf_insn *patch = insn_buf; - - *patch++ = BPF_ST_NOSPEC(); - *patch++ = *insn; - cnt = patch - insn_buf; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = new_prog; - insn = new_prog->insnsi + i + delta; - /* This can not be easily merged with the - * nospec_result-case, because an insn may require a - * nospec before and after itself. 
Therefore also do not - * 'continue' here but potentially apply further - * patching to insn. *insn should equal patch[1] now. - */ - } - - if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || - insn->code == (BPF_LDX | BPF_MEM | BPF_H) || - insn->code == (BPF_LDX | BPF_MEM | BPF_W) || - insn->code == (BPF_LDX | BPF_MEM | BPF_DW) || - insn->code == (BPF_LDX | BPF_MEMSX | BPF_B) || - insn->code == (BPF_LDX | BPF_MEMSX | BPF_H) || - insn->code == (BPF_LDX | BPF_MEMSX | BPF_W)) { - type = BPF_READ; - } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || - insn->code == (BPF_STX | BPF_MEM | BPF_H) || - insn->code == (BPF_STX | BPF_MEM | BPF_W) || - insn->code == (BPF_STX | BPF_MEM | BPF_DW) || - insn->code == (BPF_ST | BPF_MEM | BPF_B) || - insn->code == (BPF_ST | BPF_MEM | BPF_H) || - insn->code == (BPF_ST | BPF_MEM | BPF_W) || - insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { - type = BPF_WRITE; - } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) || - insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) || - insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || - insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && - env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { - insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); - env->prog->aux->num_exentries++; - continue; - } else if (insn->code == (BPF_JMP | BPF_EXIT) && - epilogue_cnt && - i + delta < subprogs[1].start) { - /* Generate epilogue for the main prog */ - if (epilogue_idx) { - /* jump back to the earlier generated epilogue */ - insn_buf[0] = BPF_JMP32_A(epilogue_idx - i - delta - 1); - cnt = 1; - } else { - memcpy(insn_buf, epilogue_buf, - epilogue_cnt * sizeof(*epilogue_buf)); - cnt = epilogue_cnt; - /* epilogue_idx cannot be 0. It must have at - * least one ctx ptr saving insn before the - * epilogue. 
- */ - epilogue_idx = i + delta; - } - goto patch_insn_buf; - } else { - continue; - } - - if (type == BPF_WRITE && - env->insn_aux_data[i + delta].nospec_result) { - /* nospec_result is only used to mitigate Spectre v4 and - * to limit verification-time for Spectre v1. - */ - struct bpf_insn *patch = insn_buf; - - *patch++ = *insn; - *patch++ = BPF_ST_NOSPEC(); - cnt = patch - insn_buf; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = new_prog; - insn = new_prog->insnsi + i + delta; - continue; - } - - switch ((int)env->insn_aux_data[i + delta].ptr_type) { - case PTR_TO_CTX: - if (!ops->convert_ctx_access) - continue; - convert_ctx_access = ops->convert_ctx_access; - break; - case PTR_TO_SOCKET: - case PTR_TO_SOCK_COMMON: - convert_ctx_access = bpf_sock_convert_ctx_access; - break; - case PTR_TO_TCP_SOCK: - convert_ctx_access = bpf_tcp_sock_convert_ctx_access; - break; - case PTR_TO_XDP_SOCK: - convert_ctx_access = bpf_xdp_sock_convert_ctx_access; - break; - case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID | PTR_UNTRUSTED: - /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike - * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot - * be said once it is marked PTR_UNTRUSTED, hence we must handle - * any faults for loads into such types. BPF_WRITE is disallowed - * for this case. 
- */ - case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: - case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED: - if (type == BPF_READ) { - if (BPF_MODE(insn->code) == BPF_MEM) - insn->code = BPF_LDX | BPF_PROBE_MEM | - BPF_SIZE((insn)->code); - else - insn->code = BPF_LDX | BPF_PROBE_MEMSX | - BPF_SIZE((insn)->code); - env->prog->aux->num_exentries++; - } - continue; - case PTR_TO_ARENA: - if (BPF_MODE(insn->code) == BPF_MEMSX) { - if (!bpf_jit_supports_insn(insn, true)) { - verbose(env, "sign extending loads from arena are not supported yet\n"); - return -EOPNOTSUPP; - } - insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code); - } else { - insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); - } - env->prog->aux->num_exentries++; - continue; - default: - continue; - } - - ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; - size = BPF_LDST_BYTES(insn); - mode = BPF_MODE(insn->code); - - /* If the read access is a narrower load of the field, - * convert to a 4/8-byte load, to minimum program type specific - * convert_ctx_access changes. If conversion is successful, - * we will apply proper mask to the result. 
- */ - is_narrower_load = size < ctx_field_size; - size_default = bpf_ctx_off_adjust_machine(ctx_field_size); - off = insn->off; - if (is_narrower_load) { - u8 size_code; - - if (type == BPF_WRITE) { - verifier_bug(env, "narrow ctx access misconfigured"); - return -EFAULT; - } - - size_code = BPF_H; - if (ctx_field_size == 4) - size_code = BPF_W; - else if (ctx_field_size == 8) - size_code = BPF_DW; - - insn->off = off & ~(size_default - 1); - insn->code = BPF_LDX | BPF_MEM | size_code; - } - - target_size = 0; - cnt = convert_ctx_access(type, insn, insn_buf, env->prog, - &target_size); - if (cnt == 0 || cnt >= INSN_BUF_SIZE || - (ctx_field_size && !target_size)) { - verifier_bug(env, "error during ctx access conversion (%d)", cnt); - return -EFAULT; - } - - if (is_narrower_load && size < target_size) { - u8 shift = bpf_ctx_narrow_access_offset( - off, size, size_default) * 8; - if (shift && cnt + 1 >= INSN_BUF_SIZE) { - verifier_bug(env, "narrow ctx load misconfigured"); - return -EFAULT; - } - if (ctx_field_size <= 4) { - if (shift) - insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, - insn->dst_reg, - shift); - insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); - } else { - if (shift) - insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, - insn->dst_reg, - shift); - insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, - (1ULL << size * 8) - 1); - } - } - if (mode == BPF_MEMSX) - insn_buf[cnt++] = BPF_RAW_INSN(BPF_ALU64 | BPF_MOV | BPF_X, - insn->dst_reg, insn->dst_reg, - size * 8, 0); - -patch_insn_buf: - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - - /* keep walking new program and skip insns we just inserted */ - env->prog = new_prog; - insn = new_prog->insnsi + i + delta; - } - - return 0; -} - -static int jit_subprogs(struct bpf_verifier_env *env) -{ - struct bpf_prog *prog = env->prog, **func, *tmp; - int i, j, subprog_start, subprog_end = 0, len, subprog; - struct 
bpf_map *map_ptr; - struct bpf_insn *insn; - void *old_bpf_func; - int err, num_exentries; - int old_len, subprog_start_adjustment = 0; - - if (env->subprog_cnt <= 1) - return 0; - - for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn)) - continue; - - /* Upon error here we cannot fall back to interpreter but - * need a hard reject of the program. Thus -EFAULT is - * propagated in any case. - */ - subprog = find_subprog(env, i + insn->imm + 1); - if (verifier_bug_if(subprog < 0, env, "No program to jit at insn %d", - i + insn->imm + 1)) - return -EFAULT; - /* temporarily remember subprog id inside insn instead of - * aux_data, since next loop will split up all insns into funcs - */ - insn->off = subprog; - /* remember original imm in case JIT fails and fallback - * to interpreter will be needed - */ - env->insn_aux_data[i].call_imm = insn->imm; - /* point imm to __bpf_call_base+1 from JITs point of view */ - insn->imm = 1; - if (bpf_pseudo_func(insn)) { -#if defined(MODULES_VADDR) - u64 addr = MODULES_VADDR; -#else - u64 addr = VMALLOC_START; -#endif - /* jit (e.g. x86_64) may emit fewer instructions - * if it learns a u32 imm is the same as a u64 imm. - * Set close enough to possible prog address. - */ - insn[0].imm = (u32)addr; - insn[1].imm = addr >> 32; - } - } - - err = bpf_prog_alloc_jited_linfo(prog); - if (err) - goto out_undo_insn; - - err = -ENOMEM; - func = kzalloc_objs(prog, env->subprog_cnt); - if (!func) - goto out_undo_insn; - - for (i = 0; i < env->subprog_cnt; i++) { - subprog_start = subprog_end; - subprog_end = env->subprog_info[i + 1].start; - - len = subprog_end - subprog_start; - /* bpf_prog_run() doesn't call subprogs directly, - * hence main prog stats include the runtime of subprogs. 
- * subprogs don't have IDs and not reachable via prog_get_next_id - * func[i]->stats will never be accessed and stays NULL - */ - func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); - if (!func[i]) - goto out_free; - memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], - len * sizeof(struct bpf_insn)); - func[i]->type = prog->type; - func[i]->len = len; - if (bpf_prog_calc_tag(func[i])) - goto out_free; - func[i]->is_func = 1; - func[i]->sleepable = prog->sleepable; - func[i]->aux->func_idx = i; - /* Below members will be freed only at prog->aux */ - func[i]->aux->btf = prog->aux->btf; - func[i]->aux->subprog_start = subprog_start + subprog_start_adjustment; - func[i]->aux->func_info = prog->aux->func_info; - func[i]->aux->func_info_cnt = prog->aux->func_info_cnt; - func[i]->aux->poke_tab = prog->aux->poke_tab; - func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; - func[i]->aux->main_prog_aux = prog->aux; - - for (j = 0; j < prog->aux->size_poke_tab; j++) { - struct bpf_jit_poke_descriptor *poke; - - poke = &prog->aux->poke_tab[j]; - if (poke->insn_idx < subprog_end && - poke->insn_idx >= subprog_start) - poke->aux = func[i]->aux; - } - - func[i]->aux->name[0] = 'F'; - func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; - if (env->subprog_info[i].priv_stack_mode == PRIV_STACK_ADAPTIVE) - func[i]->aux->jits_use_priv_stack = true; - - func[i]->jit_requested = 1; - func[i]->blinding_requested = prog->blinding_requested; - func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; - func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab; - func[i]->aux->linfo = prog->aux->linfo; - func[i]->aux->nr_linfo = prog->aux->nr_linfo; - func[i]->aux->jited_linfo = prog->aux->jited_linfo; - func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; - func[i]->aux->arena = prog->aux->arena; - func[i]->aux->used_maps = env->used_maps; - func[i]->aux->used_map_cnt = env->used_map_cnt; - num_exentries = 0; - insn = func[i]->insnsi; - for (j = 0; j < func[i]->len; 
j++, insn++) { - if (BPF_CLASS(insn->code) == BPF_LDX && - (BPF_MODE(insn->code) == BPF_PROBE_MEM || - BPF_MODE(insn->code) == BPF_PROBE_MEM32 || - BPF_MODE(insn->code) == BPF_PROBE_MEM32SX || - BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) - num_exentries++; - if ((BPF_CLASS(insn->code) == BPF_STX || - BPF_CLASS(insn->code) == BPF_ST) && - BPF_MODE(insn->code) == BPF_PROBE_MEM32) - num_exentries++; - if (BPF_CLASS(insn->code) == BPF_STX && - BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) - num_exentries++; - } - func[i]->aux->num_exentries = num_exentries; - func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; - func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; - func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; - func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; - if (!i) - func[i]->aux->exception_boundary = env->seen_exception; - - /* - * To properly pass the absolute subprog start to jit - * all instruction adjustments should be accumulated - */ - old_len = func[i]->len; - func[i] = bpf_int_jit_compile(func[i]); - subprog_start_adjustment += func[i]->len - old_len; - - if (!func[i]->jited) { - err = -ENOTSUPP; - goto out_free; - } - cond_resched(); - } - - /* at this point all bpf functions were successfully JITed - * now populate all bpf_calls with correct addresses and - * run last pass of JIT - */ - for (i = 0; i < env->subprog_cnt; i++) { - insn = func[i]->insnsi; - for (j = 0; j < func[i]->len; j++, insn++) { - if (bpf_pseudo_func(insn)) { - subprog = insn->off; - insn[0].imm = (u32)(long)func[subprog]->bpf_func; - insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; - continue; - } - if (!bpf_pseudo_call(insn)) - continue; - subprog = insn->off; - insn->imm = BPF_CALL_IMM(func[subprog]->bpf_func); - } - - /* we use the aux data to keep a list of the start addresses - * of the JITed images for each function in the program - * - * for some architectures, such as powerpc64, the imm field 
- * might not be large enough to hold the offset of the start - * address of the callee's JITed image from __bpf_call_base - * - * in such cases, we can lookup the start address of a callee - * by using its subprog id, available from the off field of - * the call instruction, as an index for this list - */ - func[i]->aux->func = func; - func[i]->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; - func[i]->aux->real_func_cnt = env->subprog_cnt; - } - for (i = 0; i < env->subprog_cnt; i++) { - old_bpf_func = func[i]->bpf_func; - tmp = bpf_int_jit_compile(func[i]); - if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { - verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); - err = -ENOTSUPP; - goto out_free; - } - cond_resched(); - } - - /* - * Cleanup func[i]->aux fields which aren't required - * or can become invalid in future - */ - for (i = 0; i < env->subprog_cnt; i++) { - func[i]->aux->used_maps = NULL; - func[i]->aux->used_map_cnt = 0; - } - - /* finally lock prog and jit images for all functions and - * populate kallsysm. Begin at the first subprogram, since - * bpf_prog_load will add the kallsyms for the main program. - */ - for (i = 1; i < env->subprog_cnt; i++) { - err = bpf_prog_lock_ro(func[i]); - if (err) - goto out_free; - } - - for (i = 1; i < env->subprog_cnt; i++) - bpf_prog_kallsyms_add(func[i]); - - /* Last step: make now unused interpreter insns from main - * prog consistent for later dump requests, so they can - * later look the same as if they were interpreted only. 
- */ - for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (bpf_pseudo_func(insn)) { - insn[0].imm = env->insn_aux_data[i].call_imm; - insn[1].imm = insn->off; - insn->off = 0; - continue; - } - if (!bpf_pseudo_call(insn)) - continue; - insn->off = env->insn_aux_data[i].call_imm; - subprog = find_subprog(env, i + insn->off + 1); - insn->imm = subprog; - } - - prog->jited = 1; - prog->bpf_func = func[0]->bpf_func; - prog->jited_len = func[0]->jited_len; - prog->aux->extable = func[0]->aux->extable; - prog->aux->num_exentries = func[0]->aux->num_exentries; - prog->aux->func = func; - prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; - prog->aux->real_func_cnt = env->subprog_cnt; - prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; - prog->aux->exception_boundary = func[0]->aux->exception_boundary; - bpf_prog_jit_attempt_done(prog); - return 0; -out_free: - /* We failed JIT'ing, so at this point we need to unregister poke - * descriptors from subprogs, so that kernel is not attempting to - * patch it anymore as we're freeing the subprog JIT memory. - */ - for (i = 0; i < prog->aux->size_poke_tab; i++) { - map_ptr = prog->aux->poke_tab[i].tail_call.map; - map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); - } - /* At this point we're guaranteed that poke descriptors are not - * live anymore. We can just unlink its descriptor table as it's - * released with the main prog. 
- */ - for (i = 0; i < env->subprog_cnt; i++) { - if (!func[i]) - continue; - func[i]->aux->poke_tab = NULL; - bpf_jit_free(func[i]); - } - kfree(func); -out_undo_insn: - /* cleanup main prog to be interpreted */ - prog->jit_requested = 0; - prog->blinding_requested = 0; - for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (!bpf_pseudo_call(insn)) - continue; - insn->off = 0; - insn->imm = env->insn_aux_data[i].call_imm; - } - bpf_prog_jit_attempt_done(prog); - return err; -} - -static int fixup_call_args(struct bpf_verifier_env *env) -{ -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - struct bpf_prog *prog = env->prog; - struct bpf_insn *insn = prog->insnsi; - bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); - int i, depth; -#endif - int err = 0; - - if (env->prog->jit_requested && - !bpf_prog_is_offloaded(env->prog->aux)) { - err = jit_subprogs(env); - if (err == 0) - return 0; - if (err == -EFAULT) - return err; - } -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - if (has_kfunc_call) { - verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); - return -EINVAL; - } - if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { - /* When JIT fails the progs with bpf2bpf calls and tail_calls - * have to be rejected, since interpreter doesn't support them yet. - */ - verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); - return -EINVAL; - } - for (i = 0; i < prog->len; i++, insn++) { - if (bpf_pseudo_func(insn)) { - /* When JIT fails the progs with callback calls - * have to be rejected, since interpreter doesn't support them yet. 
- */ - verbose(env, "callbacks are not allowed in non-JITed programs\n"); - return -EINVAL; - } - - if (!bpf_pseudo_call(insn)) - continue; - depth = get_callee_stack_depth(env, insn, i); - if (depth < 0) - return depth; - bpf_patch_call_args(insn, depth); - } - err = 0; -#endif - return err; -} - -/* replace a generic kfunc with a specialized version if necessary */ -static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx) -{ - struct bpf_prog *prog = env->prog; - bool seen_direct_write; - void *xdp_kfunc; - bool is_rdonly; - u32 func_id = desc->func_id; - u16 offset = desc->offset; - unsigned long addr = desc->addr; - - if (offset) /* return if module BTF is used */ - return 0; - - if (bpf_dev_bound_kfunc_id(func_id)) { - xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); - if (xdp_kfunc) - addr = (unsigned long)xdp_kfunc; - /* fallback to default kfunc when not supported by netdev */ - } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { - seen_direct_write = env->seen_direct_write; - is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); - - if (is_rdonly) - addr = (unsigned long)bpf_dynptr_from_skb_rdonly; - - /* restore env->seen_direct_write to its original value, since - * may_access_direct_pkt_data mutates it - */ - env->seen_direct_write = seen_direct_write; - } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) { - if (bpf_lsm_has_d_inode_locked(prog)) - addr = (unsigned long)bpf_set_dentry_xattr_locked; - } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) { - if (bpf_lsm_has_d_inode_locked(prog)) - addr = (unsigned long)bpf_remove_dentry_xattr_locked; - } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { - if (!env->insn_aux_data[insn_idx].non_sleepable) - addr = (unsigned long)bpf_dynptr_from_file_sleepable; - } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) { - if (env->insn_aux_data[insn_idx].non_sleepable) - 
addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable; - } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) { - if (env->insn_aux_data[insn_idx].non_sleepable) - addr = (unsigned long)bpf_arena_free_pages_non_sleepable; - } - desc->addr = addr; - return 0; -} - -static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, - u16 struct_meta_reg, - u16 node_offset_reg, - struct bpf_insn *insn, - struct bpf_insn *insn_buf, - int *cnt) -{ - struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta; - struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) }; - - insn_buf[0] = addr[0]; - insn_buf[1] = addr[1]; - insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off); - insn_buf[3] = *insn; - *cnt = 4; -} - -static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - struct bpf_insn *insn_buf, int insn_idx, int *cnt) -{ - struct bpf_kfunc_desc *desc; - int err; - - if (!insn->imm) { - verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); - return -EINVAL; - } - - *cnt = 0; - - /* insn->imm has the btf func_id. Replace it with an offset relative to - * __bpf_call_base, unless the JIT needs to call functions that are - * further than 32 bits away (bpf_jit_supports_far_kfunc_call()). 
- */ - desc = find_kfunc_desc(env->prog, insn->imm, insn->off); - if (!desc) { - verifier_bug(env, "kernel function descriptor not found for func_id %u", - insn->imm); - return -EFAULT; - } - - err = specialize_kfunc(env, desc, insn_idx); - if (err) - return err; - - if (!bpf_jit_supports_far_kfunc_call()) - insn->imm = BPF_CALL_IMM(desc->addr); - - if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] || - desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) { - struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; - struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; - u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; - - if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) { - verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", - insn_idx); - return -EFAULT; - } - - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); - insn_buf[1] = addr[0]; - insn_buf[2] = addr[1]; - insn_buf[3] = *insn; - *cnt = 4; - } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] || - desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] || - desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) { - struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; - struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; - - if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) { - verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", - insn_idx); - return -EFAULT; - } - - if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && - !kptr_struct_meta) { - verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", - insn_idx); - return -EFAULT; - } - - insn_buf[0] = addr[0]; - insn_buf[1] = addr[1]; - insn_buf[2] = *insn; - *cnt = 3; - } else if (desc->func_id == 
special_kfunc_list[KF_bpf_list_push_back_impl] || - desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || - desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { - struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; - int struct_meta_reg = BPF_REG_3; - int node_offset_reg = BPF_REG_4; - - /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ - if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { - struct_meta_reg = BPF_REG_4; - node_offset_reg = BPF_REG_5; - } - - if (!kptr_struct_meta) { - verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", - insn_idx); - return -EFAULT; - } - - __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg, - node_offset_reg, insn, insn_buf, cnt); - } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || - desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { - insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); - *cnt = 1; - } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { - /* - * inline the bpf_session_is_return() for fsession: - * bool bpf_session_is_return(void *ctx) - * { - * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; - * } - */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); - insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); - *cnt = 3; - } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { - /* - * inline bpf_session_cookie() for fsession: - * __u64 *bpf_session_cookie(void *ctx) - * { - * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF; - * return &((u64 *)ctx)[-off]; - * } - */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, 
BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT); - insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); - insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); - insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1); - insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0); - *cnt = 6; - } - - if (env->insn_aux_data[insn_idx].arg_prog) { - u32 regno = env->insn_aux_data[insn_idx].arg_prog; - struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) }; - int idx = *cnt; - - insn_buf[idx++] = ld_addrs[0]; - insn_buf[idx++] = ld_addrs[1]; - insn_buf[idx++] = *insn; - *cnt = idx; - } - return 0; -} - -/* The function requires that first instruction in 'patch' is insnsi[prog->len - 1] */ -static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *patch, int len) -{ - struct bpf_subprog_info *info = env->subprog_info; - int cnt = env->subprog_cnt; - struct bpf_prog *prog; - - /* We only reserve one slot for hidden subprogs in subprog_info. */ - if (env->hidden_subprog_cnt) { - verifier_bug(env, "only one hidden subprog supported"); - return -EFAULT; - } - /* We're not patching any existing instruction, just appending the new - * ones for the hidden subprog. Hence all of the adjustment operations - * in bpf_patch_insn_data are no-ops. - */ - prog = bpf_patch_insn_data(env, env->prog->len - 1, patch, len); - if (!prog) - return -ENOMEM; - env->prog = prog; - info[cnt + 1].start = info[cnt].start; - info[cnt].start = prog->len - len + 1; - env->subprog_cnt++; - env->hidden_subprog_cnt++; - return 0; -} - -/* Do various post-verification rewrites in a single program pass. - * These rewrites simplify JIT and interpreter implementations. 
- */ -static int do_misc_fixups(struct bpf_verifier_env *env) -{ - struct bpf_prog *prog = env->prog; - enum bpf_attach_type eatype = prog->expected_attach_type; - enum bpf_prog_type prog_type = resolve_prog_type(prog); - struct bpf_insn *insn = prog->insnsi; - const struct bpf_func_proto *fn; - const int insn_cnt = prog->len; - const struct bpf_map_ops *ops; - struct bpf_insn_aux_data *aux; - struct bpf_insn *insn_buf = env->insn_buf; - struct bpf_prog *new_prog; - struct bpf_map *map_ptr; - int i, ret, cnt, delta = 0, cur_subprog = 0; - struct bpf_subprog_info *subprogs = env->subprog_info; - u16 stack_depth = subprogs[cur_subprog].stack_depth; - u16 stack_depth_extra = 0; - - if (env->seen_exception && !env->exception_callback_subprog) { - struct bpf_insn *patch = insn_buf; - - *patch++ = env->prog->insnsi[insn_cnt - 1]; - *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); - *patch++ = BPF_EXIT_INSN(); - ret = add_hidden_subprog(env, insn_buf, patch - insn_buf); - if (ret < 0) - return ret; - prog = env->prog; - insn = prog->insnsi; - - env->exception_callback_subprog = env->subprog_cnt - 1; - /* Don't update insn_cnt, as add_hidden_subprog always appends insns */ - mark_subprog_exc_cb(env, env->exception_callback_subprog); - } - - for (i = 0; i < insn_cnt;) { - if (insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->imm) { - if ((insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1) || - (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { - /* convert to 32-bit mov that clears upper 32-bit */ - insn->code = BPF_ALU | BPF_MOV | BPF_X; - /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */ - insn->off = 0; - insn->imm = 0; - } /* cast from as(0) to as(1) should be handled by JIT */ - goto next_insn; - } - - if (env->insn_aux_data[i + delta].needs_zext) - /* Convert BPF_CLASS(insn->code) == BPF_ALU64 to 32-bit ALU */ - insn->code = BPF_ALU | BPF_OP(insn->code) | BPF_SRC(insn->code); - - /* Make sdiv/smod divide-by-minus-one 
exceptions impossible. */ - if ((insn->code == (BPF_ALU64 | BPF_MOD | BPF_K) || - insn->code == (BPF_ALU64 | BPF_DIV | BPF_K) || - insn->code == (BPF_ALU | BPF_MOD | BPF_K) || - insn->code == (BPF_ALU | BPF_DIV | BPF_K)) && - insn->off == 1 && insn->imm == -1) { - bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - bool isdiv = BPF_OP(insn->code) == BPF_DIV; - struct bpf_insn *patch = insn_buf; - - if (isdiv) - *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0); - else - *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); - - cnt = patch - insn_buf; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Make divide-by-zero and divide-by-minus-one exceptions impossible. */ - if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || - insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || - insn->code == (BPF_ALU | BPF_MOD | BPF_X) || - insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - bool isdiv = BPF_OP(insn->code) == BPF_DIV; - bool is_sdiv = isdiv && insn->off == 1; - bool is_smod = !isdiv && insn->off == 1; - struct bpf_insn *patch = insn_buf; - - if (is_sdiv) { - /* [R,W]x sdiv 0 -> 0 - * LLONG_MIN sdiv -1 -> LLONG_MIN - * INT_MIN sdiv -1 -> INT_MIN - */ - *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); - *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1); - *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 4, 1); - *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 1, 0); - *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_MOV | BPF_K, insn->dst_reg, - 0, 0, 0); - /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */ - *patch++ = BPF_RAW_INSN((is64 ? 
BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0); - *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *patch++ = *insn; - cnt = patch - insn_buf; - } else if (is_smod) { - /* [R,W]x mod 0 -> [R,W]x */ - /* [R,W]x mod -1 -> 0 */ - *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); - *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1); - *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 3, 1); - *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 3 + (is64 ? 0 : 1), 1); - *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); - *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *patch++ = *insn; - - if (!is64) { - *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); - } - cnt = patch - insn_buf; - } else if (isdiv) { - /* [R,W]x div 0 -> 0 */ - *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JNE | BPF_K, insn->src_reg, - 0, 2, 0); - *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg); - *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *patch++ = *insn; - cnt = patch - insn_buf; - } else { - /* [R,W]x mod 0 -> [R,W]x */ - *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, insn->src_reg, - 0, 1 + (is64 ? 
0 : 1), 0); - *patch++ = *insn; - - if (!is64) { - *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); - } - cnt = patch - insn_buf; - } - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Make it impossible to de-reference a userspace address */ - if (BPF_CLASS(insn->code) == BPF_LDX && - (BPF_MODE(insn->code) == BPF_PROBE_MEM || - BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { - struct bpf_insn *patch = insn_buf; - u64 uaddress_limit = bpf_arch_uaddress_limit(); - - if (!uaddress_limit) - goto next_insn; - - *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); - if (insn->off) - *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off); - *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32); - *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2); - *patch++ = *insn; - *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); - *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0); - - cnt = patch - insn_buf; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */ - if (BPF_CLASS(insn->code) == BPF_LD && - (BPF_MODE(insn->code) == BPF_ABS || - BPF_MODE(insn->code) == BPF_IND)) { - cnt = env->ops->gen_ld_abs(insn, insn_buf); - if (cnt == 0 || cnt >= INSN_BUF_SIZE) { - verifier_bug(env, "%d insns generated for ld_abs", cnt); - return -EFAULT; - } - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Rewrite pointer arithmetic to mitigate speculation attacks. 
*/ - if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || - insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { - const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; - const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; - struct bpf_insn *patch = insn_buf; - bool issrc, isneg, isimm; - u32 off_reg; - - aux = &env->insn_aux_data[i + delta]; - if (!aux->alu_state || - aux->alu_state == BPF_ALU_NON_POINTER) - goto next_insn; - - isneg = aux->alu_state & BPF_ALU_NEG_VALUE; - issrc = (aux->alu_state & BPF_ALU_SANITIZE) == - BPF_ALU_SANITIZE_SRC; - isimm = aux->alu_state & BPF_ALU_IMMEDIATE; - - off_reg = issrc ? insn->src_reg : insn->dst_reg; - if (isimm) { - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); - } else { - if (isneg) - *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); - *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); - *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); - *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); - } - if (!issrc) - *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); - insn->src_reg = BPF_REG_AX; - if (isneg) - insn->code = insn->code == code_add ? - code_sub : code_add; - *patch++ = *insn; - if (issrc && isneg && !isimm) - *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - cnt = patch - insn_buf; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { - int stack_off_cnt = -stack_depth - 16; - - /* - * Two 8 byte slots, depth-16 stores the count, and - * depth-8 stores the start timestamp of the loop. - * - * The starting value of count is BPF_MAX_TIMED_LOOPS - * (0xffff). 
Every iteration loads it and subs it by 1, - * until the value becomes 0 in AX (thus, 1 in stack), - * after which we call arch_bpf_timed_may_goto, which - * either sets AX to 0xffff to keep looping, or to 0 - * upon timeout. AX is then stored into the stack. In - * the next iteration, we either see 0 and break out, or - * continue iterating until the next time value is 0 - * after subtraction, rinse and repeat. - */ - stack_depth_extra = 16; - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); - if (insn->off >= 0) - insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); - else - insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); - insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); - insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); - /* - * AX is used as an argument to pass in stack_off_cnt - * (to add to r10/fp), and also as the return value of - * the call to arch_bpf_timed_may_goto. - */ - insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); - insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); - insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); - cnt = 7; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } else if (is_may_goto_insn(insn)) { - int stack_off = -stack_depth - 8; - - stack_depth_extra = 8; - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off); - if (insn->off >= 0) - insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 2); - else - insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); - insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); - insn_buf[3] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off); - cnt = 4; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = 
new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - if (insn->code != (BPF_JMP | BPF_CALL)) - goto next_insn; - if (insn->src_reg == BPF_PSEUDO_CALL) - goto next_insn; - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); - if (ret) - return ret; - if (cnt == 0) - goto next_insn; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Skip inlining the helper call if the JIT does it. */ - if (bpf_jit_inlines_helper_call(insn->imm)) - goto next_insn; - - if (insn->imm == BPF_FUNC_get_route_realm) - prog->dst_needed = 1; - if (insn->imm == BPF_FUNC_get_prandom_u32) - bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_override_return) - prog->kprobe_override = 1; - if (insn->imm == BPF_FUNC_tail_call) { - /* If we tail call into other programs, we - * cannot make any assumptions since they can - * be replaced dynamically during runtime in - * the program array. 
- */ - prog->cb_access = 1; - if (!allow_tail_call_in_subprogs(env)) - prog->aux->stack_depth = MAX_BPF_STACK; - prog->aux->max_pkt_offset = MAX_PACKET_OFF; - - /* mark bpf_tail_call as different opcode to avoid - * conditional branch in the interpreter for every normal - * call and to prevent accidental JITing by JIT compiler - * that doesn't support bpf_tail_call yet - */ - insn->imm = 0; - insn->code = BPF_JMP | BPF_TAIL_CALL; - - aux = &env->insn_aux_data[i + delta]; - if (env->bpf_capable && !prog->blinding_requested && - prog->jit_requested && - !bpf_map_key_poisoned(aux) && - !bpf_map_ptr_poisoned(aux) && - !bpf_map_ptr_unpriv(aux)) { - struct bpf_jit_poke_descriptor desc = { - .reason = BPF_POKE_REASON_TAIL_CALL, - .tail_call.map = aux->map_ptr_state.map_ptr, - .tail_call.key = bpf_map_key_immediate(aux), - .insn_idx = i + delta, - }; - - ret = bpf_jit_add_poke_descriptor(prog, &desc); - if (ret < 0) { - verbose(env, "adding tail call poke descriptor failed\n"); - return ret; - } - - insn->imm = ret + 1; - goto next_insn; - } - - if (!bpf_map_ptr_unpriv(aux)) - goto next_insn; - - /* instead of changing every JIT dealing with tail_call - * emit two extra insns: - * if (index >= max_entries) goto out; - * index &= array->index_mask; - * to avoid out-of-bounds cpu speculation - */ - if (bpf_map_ptr_poisoned(aux)) { - verbose(env, "tail_call abusing map_ptr\n"); - return -EINVAL; - } - - map_ptr = aux->map_ptr_state.map_ptr; - insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, - map_ptr->max_entries, 2); - insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, - container_of(map_ptr, - struct bpf_array, - map)->index_mask); - insn_buf[2] = *insn; - cnt = 3; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - if (insn->imm == BPF_FUNC_timer_set_callback) { - /* The verifier will process callback_fn as many 
times as necessary - * with different maps and the register states prepared by - * set_timer_callback_state will be accurate. - * - * The following use case is valid: - * map1 is shared by prog1, prog2, prog3. - * prog1 calls bpf_timer_init for some map1 elements - * prog2 calls bpf_timer_set_callback for some map1 elements. - * Those that were not bpf_timer_init-ed will return -EINVAL. - * prog3 calls bpf_timer_start for some map1 elements. - * Those that were not both bpf_timer_init-ed and - * bpf_timer_set_callback-ed will return -EINVAL. - */ - struct bpf_insn ld_addrs[2] = { - BPF_LD_IMM64(BPF_REG_3, (long)prog->aux), - }; - - insn_buf[0] = ld_addrs[0]; - insn_buf[1] = ld_addrs[1]; - insn_buf[2] = *insn; - cnt = 3; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto patch_call_imm; - } - - if (is_storage_get_function(insn->imm)) { - if (env->insn_aux_data[i + delta].non_sleepable) - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); - else - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); - insn_buf[1] = *insn; - cnt = 2; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto patch_call_imm; - } - - /* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */ - if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) { - /* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data, - * bpf_mem_alloc() returns a ptr to the percpu data ptr. 
- */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); - insn_buf[1] = *insn; - cnt = 2; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto patch_call_imm; - } - - /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup - * and other inlining handlers are currently limited to 64 bit - * only. - */ - if (prog->jit_requested && BITS_PER_LONG == 64 && - (insn->imm == BPF_FUNC_map_lookup_elem || - insn->imm == BPF_FUNC_map_update_elem || - insn->imm == BPF_FUNC_map_delete_elem || - insn->imm == BPF_FUNC_map_push_elem || - insn->imm == BPF_FUNC_map_pop_elem || - insn->imm == BPF_FUNC_map_peek_elem || - insn->imm == BPF_FUNC_redirect_map || - insn->imm == BPF_FUNC_for_each_map_elem || - insn->imm == BPF_FUNC_map_lookup_percpu_elem)) { - aux = &env->insn_aux_data[i + delta]; - if (bpf_map_ptr_poisoned(aux)) - goto patch_call_imm; - - map_ptr = aux->map_ptr_state.map_ptr; - ops = map_ptr->ops; - if (insn->imm == BPF_FUNC_map_lookup_elem && - ops->map_gen_lookup) { - cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == -EOPNOTSUPP) - goto patch_map_ops_generic; - if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { - verifier_bug(env, "%d insns generated for map lookup", cnt); - return -EFAULT; - } - - new_prog = bpf_patch_insn_data(env, i + delta, - insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, - (void *(*)(struct bpf_map *map, void *key))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_delete_elem, - (long (*)(struct bpf_map *map, void *key))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_update_elem, - (long (*)(struct bpf_map *map, void *key, void *value, - u64 flags))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_push_elem, - (long (*)(struct bpf_map 
*map, void *value, - u64 flags))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_pop_elem, - (long (*)(struct bpf_map *map, void *value))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_peek_elem, - (long (*)(struct bpf_map *map, void *value))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_redirect, - (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, - (long (*)(struct bpf_map *map, - bpf_callback_t callback_fn, - void *callback_ctx, - u64 flags))NULL)); - BUILD_BUG_ON(!__same_type(ops->map_lookup_percpu_elem, - (void *(*)(struct bpf_map *map, void *key, u32 cpu))NULL)); - -patch_map_ops_generic: - switch (insn->imm) { - case BPF_FUNC_map_lookup_elem: - insn->imm = BPF_CALL_IMM(ops->map_lookup_elem); - goto next_insn; - case BPF_FUNC_map_update_elem: - insn->imm = BPF_CALL_IMM(ops->map_update_elem); - goto next_insn; - case BPF_FUNC_map_delete_elem: - insn->imm = BPF_CALL_IMM(ops->map_delete_elem); - goto next_insn; - case BPF_FUNC_map_push_elem: - insn->imm = BPF_CALL_IMM(ops->map_push_elem); - goto next_insn; - case BPF_FUNC_map_pop_elem: - insn->imm = BPF_CALL_IMM(ops->map_pop_elem); - goto next_insn; - case BPF_FUNC_map_peek_elem: - insn->imm = BPF_CALL_IMM(ops->map_peek_elem); - goto next_insn; - case BPF_FUNC_redirect_map: - insn->imm = BPF_CALL_IMM(ops->map_redirect); - goto next_insn; - case BPF_FUNC_for_each_map_elem: - insn->imm = BPF_CALL_IMM(ops->map_for_each_callback); - goto next_insn; - case BPF_FUNC_map_lookup_percpu_elem: - insn->imm = BPF_CALL_IMM(ops->map_lookup_percpu_elem); - goto next_insn; - } - - goto patch_call_imm; - } - - /* Implement bpf_jiffies64 inline. 
*/ - if (prog->jit_requested && BITS_PER_LONG == 64 && - insn->imm == BPF_FUNC_jiffies64) { - struct bpf_insn ld_jiffies_addr[2] = { - BPF_LD_IMM64(BPF_REG_0, - (unsigned long)&jiffies), - }; - - insn_buf[0] = ld_jiffies_addr[0]; - insn_buf[1] = ld_jiffies_addr[1]; - insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, - BPF_REG_0, 0); - cnt = 3; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, - cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - -#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) - /* Implement bpf_get_smp_processor_id() inline. */ - if (insn->imm == BPF_FUNC_get_smp_processor_id && - verifier_inlines_helper_call(env, insn->imm)) { - /* BPF_FUNC_get_smp_processor_id inlining is an - * optimization, so if cpu_number is ever - * changed in some incompatible and hard to support - * way, it's fine to back out this inlining logic - */ -#ifdef CONFIG_SMP - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number); - insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); - insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); - cnt = 3; -#else - insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); - cnt = 1; -#endif - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement bpf_get_current_task() and bpf_get_current_task_btf() inline. 
*/ - if ((insn->imm == BPF_FUNC_get_current_task || insn->imm == BPF_FUNC_get_current_task_btf) && - verifier_inlines_helper_call(env, insn->imm)) { - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&current_task); - insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); - insn_buf[2] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); - cnt = 3; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } -#endif - /* Implement bpf_get_func_arg inline. */ - if (prog_type == BPF_PROG_TYPE_TRACING && - insn->imm == BPF_FUNC_get_func_arg) { - if (eatype == BPF_TRACE_RAW_TP) { - int nr_args = btf_type_vlen(prog->aux->attach_func_proto); - - /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); - cnt = 1; - } else { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); - cnt = 2; - } - insn_buf[cnt++] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); - insn_buf[cnt++] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); - insn_buf[cnt++] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); - insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); - insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 0); - insn_buf[cnt++] = BPF_JMP_A(1); - insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement bpf_get_func_ret inline. 
*/ - if (prog_type == BPF_PROG_TYPE_TRACING && - insn->imm == BPF_FUNC_get_func_ret) { - if (eatype == BPF_TRACE_FEXIT || - eatype == BPF_TRACE_FSESSION || - eatype == BPF_MODIFY_RETURN) { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); - insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); - insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); - insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); - insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); - insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); - cnt = 7; - } else { - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); - cnt = 1; - } - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement get_func_arg_cnt inline. */ - if (prog_type == BPF_PROG_TYPE_TRACING && - insn->imm == BPF_FUNC_get_func_arg_cnt) { - if (eatype == BPF_TRACE_RAW_TP) { - int nr_args = btf_type_vlen(prog->aux->attach_func_proto); - - /* skip 'void *__data' in btf_trace_##name() and save to reg0 */ - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, nr_args - 1); - cnt = 1; - } else { - /* Load nr_args from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); - insn_buf[1] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); - cnt = 2; - } - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement bpf_get_func_ip inline. 
*/ - if (prog_type == BPF_PROG_TYPE_TRACING && - insn->imm == BPF_FUNC_get_func_ip) { - /* Load IP address from ctx - 16 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16); - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); - if (!new_prog) - return -ENOMEM; - - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement bpf_get_branch_snapshot inline. */ - if (IS_ENABLED(CONFIG_PERF_EVENTS) && - prog->jit_requested && BITS_PER_LONG == 64 && - insn->imm == BPF_FUNC_get_branch_snapshot) { - /* We are dealing with the following func protos: - * u64 bpf_get_branch_snapshot(void *buf, u32 size, u64 flags); - * int perf_snapshot_branch_stack(struct perf_branch_entry *entries, u32 cnt); - */ - const u32 br_entry_size = sizeof(struct perf_branch_entry); - - /* struct perf_branch_entry is part of UAPI and is - * used as an array element, so extremely unlikely to - * ever grow or shrink - */ - BUILD_BUG_ON(br_entry_size != 24); - - /* if (unlikely(flags)) return -EINVAL */ - insn_buf[0] = BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 0, 7); - - /* Transform size (bytes) into number of entries (cnt = size / 24). - * But to avoid expensive division instruction, we implement - * divide-by-3 through multiplication, followed by further - * division by 8 through 3-bit right shift. - * Refer to book "Hacker's Delight, 2nd ed." by Henry S. Warren, Jr., - * p. 227, chapter "Unsigned Division by 3" for details and proofs. - * - * N / 3 <=> M * N / 2^33, where M = (2^33 + 1) / 3 = 0xaaaaaaab. 
- */ - insn_buf[1] = BPF_MOV32_IMM(BPF_REG_0, 0xaaaaaaab); - insn_buf[2] = BPF_ALU64_REG(BPF_MUL, BPF_REG_2, BPF_REG_0); - insn_buf[3] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36); - - /* call perf_snapshot_branch_stack implementation */ - insn_buf[4] = BPF_EMIT_CALL(static_call_query(perf_snapshot_branch_stack)); - /* if (entry_cnt == 0) return -ENOENT */ - insn_buf[5] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4); - /* return entry_cnt * sizeof(struct perf_branch_entry) */ - insn_buf[6] = BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, br_entry_size); - insn_buf[7] = BPF_JMP_A(3); - /* return -EINVAL; */ - insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); - insn_buf[9] = BPF_JMP_A(1); - /* return -ENOENT; */ - insn_buf[10] = BPF_MOV64_IMM(BPF_REG_0, -ENOENT); - cnt = 11; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } - - /* Implement bpf_kptr_xchg inline */ - if (prog->jit_requested && BITS_PER_LONG == 64 && - insn->imm == BPF_FUNC_kptr_xchg && - bpf_jit_supports_ptr_xchg()) { - insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_2); - insn_buf[1] = BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_0, 0); - cnt = 2; - - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - goto next_insn; - } -patch_call_imm: - fn = env->ops->get_func_proto(insn->imm, env->prog); - /* all functions that have prototype and verifier allowed - * programs to call them, must be real in-kernel functions - */ - if (!fn->func) { - verifier_bug(env, - "not inlined functions %s#%d is missing func", - func_id_name(insn->imm), insn->imm); - return -EFAULT; - } - insn->imm = fn->func - __bpf_call_base; -next_insn: - if (subprogs[cur_subprog + 1].start == i + delta + 1) { - subprogs[cur_subprog].stack_depth += 
stack_depth_extra; - subprogs[cur_subprog].stack_extra = stack_depth_extra; - - stack_depth = subprogs[cur_subprog].stack_depth; - if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) { - verbose(env, "stack size %d(extra %d) is too large\n", - stack_depth, stack_depth_extra); - return -EINVAL; - } - cur_subprog++; - stack_depth = subprogs[cur_subprog].stack_depth; - stack_depth_extra = 0; - } - i++; - insn++; - } - - env->prog->aux->stack_depth = subprogs[0].stack_depth; - for (i = 0; i < env->subprog_cnt; i++) { - int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; - int subprog_start = subprogs[i].start; - int stack_slots = subprogs[i].stack_extra / 8; - int slots = delta, cnt = 0; - - if (!stack_slots) - continue; - /* We need two slots in case timed may_goto is supported. */ - if (stack_slots > slots) { - verifier_bug(env, "stack_slots supports may_goto only"); - return -EFAULT; - } - - stack_depth = subprogs[i].stack_depth; - if (bpf_jit_supports_timed_may_goto()) { - insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, - BPF_MAX_TIMED_LOOPS); - insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); - } else { - /* Add ST insn to subprog prologue to init extra stack */ - insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, - BPF_MAX_LOOPS); - } - /* Copy first actual insn to preserve it */ - insn_buf[cnt++] = env->prog->insnsi[subprog_start]; - - new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); - if (!new_prog) - return -ENOMEM; - env->prog = prog = new_prog; - /* - * If may_goto is a first insn of a prog there could be a jmp - * insn that points to it, hence adjust all such jmps to point - * to insn after BPF_ST that inits may_goto count. - * Adjustment will succeed because bpf_patch_insn_data() didn't fail. - */ - WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); - } - - /* Since poke tab is now finalized, publish aux to tracker. 
*/ - for (i = 0; i < prog->aux->size_poke_tab; i++) { - map_ptr = prog->aux->poke_tab[i].tail_call.map; - if (!map_ptr->ops->map_poke_track || - !map_ptr->ops->map_poke_untrack || - !map_ptr->ops->map_poke_run) { - verifier_bug(env, "poke tab is misconfigured"); - return -EFAULT; - } - - ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); - if (ret < 0) { - verbose(env, "tracking tail call prog failed\n"); - return ret; - } - } - - ret = sort_kfunc_descs_by_imm_off(env); - if (ret) - return ret; - - return 0; -} - -static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env, - int position, - s32 stack_base, - u32 callback_subprogno, - u32 *total_cnt) -{ - s32 r6_offset = stack_base + 0 * BPF_REG_SIZE; - s32 r7_offset = stack_base + 1 * BPF_REG_SIZE; - s32 r8_offset = stack_base + 2 * BPF_REG_SIZE; - int reg_loop_max = BPF_REG_6; - int reg_loop_cnt = BPF_REG_7; - int reg_loop_ctx = BPF_REG_8; - - struct bpf_insn *insn_buf = env->insn_buf; - struct bpf_prog *new_prog; - u32 callback_start; - u32 call_insn_offset; - s32 callback_offset; - u32 cnt = 0; - - /* This represents an inlined version of bpf_iter.c:bpf_loop, - * be careful to modify this code in sync. - */ - /* Return error and jump to the end of the patch if - * expected number of iterations is too big. 
- */ - insn_buf[cnt++] = BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2); - insn_buf[cnt++] = BPF_MOV32_IMM(BPF_REG_0, -E2BIG); - insn_buf[cnt++] = BPF_JMP_IMM(BPF_JA, 0, 0, 16); - /* spill R6, R7, R8 to use these as loop vars */ - insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset); - insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset); - insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset); - /* initialize loop vars */ - insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_max, BPF_REG_1); - insn_buf[cnt++] = BPF_MOV32_IMM(reg_loop_cnt, 0); - insn_buf[cnt++] = BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3); - /* loop header, - * if reg_loop_cnt >= reg_loop_max skip the loop body - */ - insn_buf[cnt++] = BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5); - /* callback call, - * correct callback offset would be set after patching - */ - insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt); - insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx); - insn_buf[cnt++] = BPF_CALL_REL(0); - /* increment loop counter */ - insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1); - /* jump to loop header if callback returned 0 */ - insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6); - /* return value of bpf_loop, - * set R0 to the number of iterations - */ - insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt); - /* restore original values of R6, R7, R8 */ - insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset); - insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset); - insn_buf[cnt++] = BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset); - - *total_cnt = cnt; - new_prog = bpf_patch_insn_data(env, position, insn_buf, cnt); - if (!new_prog) - return new_prog; - - /* callback start is known only after patching */ - callback_start = env->subprog_info[callback_subprogno].start; - /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */ - call_insn_offset = position + 12; 
- callback_offset = callback_start - call_insn_offset - 1; - new_prog->insnsi[call_insn_offset].imm = callback_offset; - - return new_prog; -} - -static bool is_bpf_loop_call(struct bpf_insn *insn) -{ - return insn->code == (BPF_JMP | BPF_CALL) && - insn->src_reg == 0 && - insn->imm == BPF_FUNC_loop; -} - -/* For all sub-programs in the program (including main) check - * insn_aux_data to see if there are bpf_loop calls that require - * inlining. If such calls are found the calls are replaced with a - * sequence of instructions produced by `inline_bpf_loop` function and - * subprog stack_depth is increased by the size of 3 registers. - * This stack space is used to spill values of the R6, R7, R8. These - * registers are used to store the loop bound, counter and context - * variables. - */ -static int optimize_bpf_loop(struct bpf_verifier_env *env) -{ - struct bpf_subprog_info *subprogs = env->subprog_info; - int i, cur_subprog = 0, cnt, delta = 0; - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - u16 stack_depth = subprogs[cur_subprog].stack_depth; - u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; - u16 stack_depth_extra = 0; - - for (i = 0; i < insn_cnt; i++, insn++) { - struct bpf_loop_inline_state *inline_state = - &env->insn_aux_data[i + delta].loop_inline_state; - - if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) { - struct bpf_prog *new_prog; - - stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup; - new_prog = inline_bpf_loop(env, - i + delta, - -(stack_depth + stack_depth_extra), - inline_state->callback_subprogno, - &cnt); - if (!new_prog) - return -ENOMEM; - - delta += cnt - 1; - env->prog = new_prog; - insn = new_prog->insnsi + i + delta; - } - - if (subprogs[cur_subprog + 1].start == i + delta + 1) { - subprogs[cur_subprog].stack_depth += stack_depth_extra; - cur_subprog++; - stack_depth = subprogs[cur_subprog].stack_depth; - stack_depth_roundup = round_up(stack_depth, 8) - stack_depth; - 
stack_depth_extra = 0; - } - } - - env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; - - return 0; -} - -/* Remove unnecessary spill/fill pairs, members of fastcall pattern, - * adjust subprograms stack depth when possible. - */ -static int remove_fastcall_spills_fills(struct bpf_verifier_env *env) -{ - struct bpf_subprog_info *subprog = env->subprog_info; - struct bpf_insn_aux_data *aux = env->insn_aux_data; - struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; - u32 spills_num; - bool modified = false; - int i, j; - - for (i = 0; i < insn_cnt; i++, insn++) { - if (aux[i].fastcall_spills_num > 0) { - spills_num = aux[i].fastcall_spills_num; - /* NOPs would be removed by opt_remove_nops() */ - for (j = 1; j <= spills_num; ++j) { - *(insn - j) = NOP; - *(insn + j) = NOP; - } - modified = true; - } - if ((subprog + 1)->start == i + 1) { - if (modified && !subprog->keep_fastcall_stack) - subprog->stack_depth = -subprog->fastcall_stack_off; - subprog++; - modified = false; - } - } - - return 0; -} static void free_states(struct bpf_verifier_env *env) { @@ -24428,13 +18635,13 @@ static void free_states(struct bpf_verifier_env *env) struct bpf_scc_info *info; int i, j; - free_verifier_state(env->cur_state, true); + bpf_free_verifier_state(env->cur_state, true); env->cur_state = NULL; while (!pop_stack(env, NULL, NULL, false)); list_for_each_safe(pos, tmp, &env->free_list) { sl = container_of(pos, struct bpf_verifier_state_list, node); - free_verifier_state(&sl->state, false); + bpf_free_verifier_state(&sl->state, false); kfree(sl); } INIT_LIST_HEAD(&env->free_list); @@ -24444,7 +18651,7 @@ static void free_states(struct bpf_verifier_env *env) if (!info) continue; for (j = 0; j < info->num_visits; j++) - free_backedges(&info->visits[j]); + bpf_free_backedges(&info->visits[j]); kvfree(info); env->scc_info[i] = NULL; } @@ -24457,7 +18664,7 @@ static void free_states(struct bpf_verifier_env *env) list_for_each_safe(pos, tmp, head) { sl = 
container_of(pos, struct bpf_verifier_state_list, node); - free_verifier_state(&sl->state, false); + bpf_free_verifier_state(&sl->state, false); kfree(sl); } INIT_LIST_HEAD(&env->explored_states[i]); @@ -24510,10 +18717,18 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) if (subprog_is_exc_cb(env, subprog)) { state->frame[0]->in_exception_callback_fn = true; - /* We have already ensured that the callback returns an integer, just - * like all global subprogs. We need to determine it only has a single - * scalar argument. + + /* + * Global functions are scalar or void, make sure + * we return a scalar. */ + if (subprog_returns_void(env, subprog)) { + verbose(env, "exception cb cannot return void\n"); + ret = -EINVAL; + goto out; + } + + /* Also ensure the callback only has a single scalar argument. */ if (sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_ANYTHING) { verbose(env, "exception cb only supports single integer argument\n"); ret = -EINVAL; @@ -24630,7 +18845,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env) again: new_cnt = 0; for (i = 1; i < env->subprog_cnt; i++) { - if (!subprog_is_global(env, i)) + if (!bpf_subprog_is_global(env, i)) continue; sub_aux = subprog_aux(env, i); @@ -24792,7 +19007,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) { - if (st_ops_desc->arg_info[member_idx].info->refcounted) { + if (st_ops_desc->arg_info[member_idx].info[i].refcounted) { has_refcounted_arg = true; break; } @@ -24820,14 +19035,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(unsigned long addr, const char *func_name) -{ - if (within_error_injection_list(addr) || - !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) - return 0; - - return -EINVAL; -} +#ifdef CONFIG_FUNCTION_ERROR_INJECTION /* list of non-sleepable functions that are 
otherwise on * ALLOW_ERROR_INJECTION list @@ -24850,6 +19058,75 @@ static int check_non_sleepable_error_inject(u32 btf_id) return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); } +static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name) +{ + /* fentry/fexit/fmod_ret progs can be sleepable if they are + * attached to ALLOW_ERROR_INJECTION and are not in denylist. + */ + if (!check_non_sleepable_error_inject(btf_id) && + within_error_injection_list(addr)) + return 0; + + return -EINVAL; +} + +static int check_attach_modify_return(unsigned long addr, const char *func_name) +{ + if (within_error_injection_list(addr) || + !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) + return 0; + + return -EINVAL; +} + +#else + +/* Unfortunately, the arch-specific prefixes are hard-coded in arch syscall code + * so we need to hard-code them, too. Ftrace has arch_syscall_match_sym_name() + * but that just compares two concrete function names. + */ +static bool has_arch_syscall_prefix(const char *func_name) +{ +#if defined(__x86_64__) + return !strncmp(func_name, "__x64_", 6); +#elif defined(__i386__) + return !strncmp(func_name, "__ia32_", 7); +#elif defined(__s390x__) + return !strncmp(func_name, "__s390x_", 8); +#elif defined(__aarch64__) + return !strncmp(func_name, "__arm64_", 8); +#elif defined(__riscv) + return !strncmp(func_name, "__riscv_", 8); +#elif defined(__powerpc__) || defined(__powerpc64__) + return !strncmp(func_name, "sys_", 4); +#elif defined(__loongarch__) + return !strncmp(func_name, "sys_", 4); +#else + return false; +#endif +} + +/* Without error injection, allow sleepable and fmod_ret progs on syscalls. 
*/ + +static int check_attach_sleepable(u32 btf_id, unsigned long addr, const char *func_name) +{ + if (has_arch_syscall_prefix(func_name)) + return 0; + + return -EINVAL; +} + +static int check_attach_modify_return(unsigned long addr, const char *func_name) +{ + if (has_arch_syscall_prefix(func_name) || + !strncmp(SECURITY_PREFIX, func_name, sizeof(SECURITY_PREFIX) - 1)) + return 0; + + return -EINVAL; +} + +#endif /* CONFIG_FUNCTION_ERROR_INJECTION */ + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, @@ -24876,7 +19153,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, btf = tgt_prog ? tgt_prog->aux->btf : prog->aux->attach_btf; if (!btf) { bpf_log(log, - "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); + "Tracing program can only be attached to another program annotated with BTF\n"); return -EINVAL; } t = btf_type_by_id(btf, btf_id); @@ -24912,7 +19189,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (aux->func && aux->func[subprog]->aux->exception_cb) { bpf_log(log, "%s programs cannot attach to exception callback\n", - prog_extension ? "Extension" : "FENTRY/FEXIT"); + prog_extension ? "Extension" : "Tracing"); return -EINVAL; } conservative = aux->func_info_aux[subprog].unreliable; @@ -25001,7 +19278,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_RAW_TP: if (tgt_prog) { bpf_log(log, - "Only FENTRY/FEXIT progs are attachable to another BPF prog\n"); + "Only FENTRY/FEXIT/FSESSION progs are attachable to another BPF prog\n"); return -EINVAL; } if (!btf_type_is_typedef(t)) { @@ -25129,12 +19406,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: - - /* fentry/fexit/fmod_ret progs can be sleepable if they are - * attached to ALLOW_ERROR_INJECTION and are not in denylist. 
- */ - if (!check_non_sleepable_error_inject(btf_id) && - within_error_injection_list(addr)) + if (!check_attach_sleepable(btf_id, addr, tname)) ret = 0; /* fentry/fexit/fmod_ret progs can also be sleepable if they are * in the fmodret id set with the KF_SLEEPABLE flag. @@ -25231,7 +19503,6 @@ BTF_ID(func, __x64_sys_exit_group) BTF_ID(func, do_exit) BTF_ID(func, do_group_exit) BTF_ID(func, kthread_complete_and_exit) -BTF_ID(func, kthread_exit) BTF_ID(func, make_task_dead) BTF_SET_END(noreturn_deny) @@ -25273,7 +19544,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->sleepable && !can_be_sleepable(prog)) { - verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); + verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); return -EINVAL; } @@ -25421,430 +19692,209 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } -/* Each field is a register bitmask */ -struct insn_live_regs { - u16 use; /* registers read by instruction */ - u16 def; /* registers written by instruction */ - u16 in; /* registers that may be alive before instruction */ - u16 out; /* registers that may be alive after instruction */ -}; +/* replace a generic kfunc with a specialized version if necessary */ +static int specialize_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_desc *desc, int insn_idx) +{ + struct bpf_prog *prog = env->prog; + bool seen_direct_write; + void *xdp_kfunc; + bool is_rdonly; + u32 func_id = desc->func_id; + u16 offset = desc->offset; + unsigned long addr = desc->addr; -/* Bitmask with 1s for all caller saved registers */ -#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + if (offset) /* return if module BTF is used */ + return 0; -/* Compute info->{use,def} fields for the instruction */ -static void compute_insn_live_regs(struct bpf_verifier_env *env, - struct bpf_insn *insn, - 
struct insn_live_regs *info) -{ - struct call_summary cs; - u8 class = BPF_CLASS(insn->code); - u8 code = BPF_OP(insn->code); - u8 mode = BPF_MODE(insn->code); - u16 src = BIT(insn->src_reg); - u16 dst = BIT(insn->dst_reg); - u16 r0 = BIT(0); - u16 def = 0; - u16 use = 0xffff; + if (bpf_dev_bound_kfunc_id(func_id)) { + xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id); + if (xdp_kfunc) + addr = (unsigned long)xdp_kfunc; + /* fallback to default kfunc when not supported by netdev */ + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) { + seen_direct_write = env->seen_direct_write; + is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE); - switch (class) { - case BPF_LD: - switch (mode) { - case BPF_IMM: - if (BPF_SIZE(insn->code) == BPF_DW) { - def = dst; - use = 0; - } - break; - case BPF_LD | BPF_ABS: - case BPF_LD | BPF_IND: - /* stick with defaults */ - break; - } - break; - case BPF_LDX: - switch (mode) { - case BPF_MEM: - case BPF_MEMSX: - def = dst; - use = src; - break; - } - break; - case BPF_ST: - switch (mode) { - case BPF_MEM: - def = 0; - use = dst; - break; - } - break; - case BPF_STX: - switch (mode) { - case BPF_MEM: - def = 0; - use = dst | src; - break; - case BPF_ATOMIC: - switch (insn->imm) { - case BPF_CMPXCHG: - use = r0 | dst | src; - def = r0; - break; - case BPF_LOAD_ACQ: - def = dst; - use = src; - break; - case BPF_STORE_REL: - def = 0; - use = dst | src; - break; - default: - use = dst | src; - if (insn->imm & BPF_FETCH) - def = src; - else - def = 0; - } - break; - } - break; - case BPF_ALU: - case BPF_ALU64: - switch (code) { - case BPF_END: - use = dst; - def = dst; - break; - case BPF_MOV: - def = dst; - if (BPF_SRC(insn->code) == BPF_K) - use = 0; - else - use = src; - break; - default: - def = dst; - if (BPF_SRC(insn->code) == BPF_K) - use = dst; - else - use = dst | src; - } - break; - case BPF_JMP: - case BPF_JMP32: - switch (code) { - case BPF_JA: - def = 0; - if (BPF_SRC(insn->code) == BPF_X) - use = 
dst; - else - use = 0; - break; - case BPF_JCOND: - def = 0; - use = 0; - break; - case BPF_EXIT: - def = 0; - use = r0; - break; - case BPF_CALL: - def = ALL_CALLER_SAVED_REGS; - use = def & ~BIT(BPF_REG_0); - if (get_call_summary(env, insn, &cs)) - use = GENMASK(cs.num_params, 1); - break; - default: - def = 0; - if (BPF_SRC(insn->code) == BPF_K) - use = dst; - else - use = dst | src; - } - break; + if (is_rdonly) + addr = (unsigned long)bpf_dynptr_from_skb_rdonly; + + /* restore env->seen_direct_write to its original value, since + * may_access_direct_pkt_data mutates it + */ + env->seen_direct_write = seen_direct_write; + } else if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_set_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr]) { + if (bpf_lsm_has_d_inode_locked(prog)) + addr = (unsigned long)bpf_remove_dentry_xattr_locked; + } else if (func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { + if (!env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_dynptr_from_file_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_alloc_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_alloc_pages_non_sleepable; + } else if (func_id == special_kfunc_list[KF_bpf_arena_free_pages]) { + if (env->insn_aux_data[insn_idx].non_sleepable) + addr = (unsigned long)bpf_arena_free_pages_non_sleepable; } + desc->addr = addr; + return 0; +} - info->def = def; - info->use = use; +static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, + u16 struct_meta_reg, + u16 node_offset_reg, + struct bpf_insn *insn, + struct bpf_insn *insn_buf, + int *cnt) +{ + struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) }; + + insn_buf[0] = addr[0]; + insn_buf[1] = addr[1]; + 
insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off); + insn_buf[3] = *insn; + *cnt = 4; } -/* Compute may-live registers after each instruction in the program. - * The register is live after the instruction I if it is read by some - * instruction S following I during program execution and is not - * overwritten between I and S. - * - * Store result in env->insn_aux_data[i].live_regs. - */ -static int compute_live_registers(struct bpf_verifier_env *env) +int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_insn *insn_buf, int insn_idx, int *cnt) { - struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; - struct bpf_insn *insns = env->prog->insnsi; - struct insn_live_regs *state; - int insn_cnt = env->prog->len; - int err = 0, i, j; - bool changed; - - /* Use the following algorithm: - * - define the following: - * - I.use : a set of all registers read by instruction I; - * - I.def : a set of all registers written by instruction I; - * - I.in : a set of all registers that may be alive before I execution; - * - I.out : a set of all registers that may be alive after I execution; - * - insn_successors(I): a set of instructions S that might immediately - * follow I for some program execution; - * - associate separate empty sets 'I.in' and 'I.out' with each instruction; - * - visit each instruction in a postorder and update - * state[i].in, state[i].out as follows: - * - * state[i].out = U [state[s].in for S in insn_successors(i)] - * state[i].in = (state[i].out / state[i].def) U state[i].use - * - * (where U stands for set union, / stands for set difference) - * - repeat the computation while {in,out} fields changes for - * any instruction. + struct bpf_kfunc_desc *desc; + int err; + + if (!insn->imm) { + verbose(env, "invalid kernel function call not eliminated in verifier pass\n"); + return -EINVAL; + } + + *cnt = 0; + + /* insn->imm has the btf func_id. 
Replace it with an offset relative to + * __bpf_call_base, unless the JIT needs to call functions that are + * further than 32 bits away (bpf_jit_supports_far_kfunc_call()). */ - state = kvzalloc_objs(*state, insn_cnt, GFP_KERNEL_ACCOUNT); - if (!state) { - err = -ENOMEM; - goto out; + desc = find_kfunc_desc(env->prog, insn->imm, insn->off); + if (!desc) { + verifier_bug(env, "kernel function descriptor not found for func_id %u", + insn->imm); + return -EFAULT; } - for (i = 0; i < insn_cnt; ++i) - compute_insn_live_regs(env, &insns[i], &state[i]); - - changed = true; - while (changed) { - changed = false; - for (i = 0; i < env->cfg.cur_postorder; ++i) { - int insn_idx = env->cfg.insn_postorder[i]; - struct insn_live_regs *live = &state[insn_idx]; - struct bpf_iarray *succ; - u16 new_out = 0; - u16 new_in = 0; - - succ = bpf_insn_successors(env, insn_idx); - for (int s = 0; s < succ->cnt; ++s) - new_out |= state[succ->items[s]].in; - new_in = (new_out & ~live->def) | live->use; - if (new_out != live->out || new_in != live->in) { - live->in = new_in; - live->out = new_out; - changed = true; - } + err = specialize_kfunc(env, desc, insn_idx); + if (err) + return err; + + if (!bpf_jit_supports_far_kfunc_call()) + insn->imm = BPF_CALL_IMM(desc->addr); + + if (is_bpf_obj_new_kfunc(desc->func_id) || is_bpf_percpu_obj_new_kfunc(desc->func_id)) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; + + if (is_bpf_percpu_obj_new_kfunc(desc->func_id) && kptr_struct_meta) { + verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", + insn_idx); + return -EFAULT; } - } - for (i = 0; i < insn_cnt; ++i) - insn_aux[i].live_regs_before = state[i].in; + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); + insn_buf[1] = addr[0]; + insn_buf[2] = addr[1]; + insn_buf[3] = *insn; + *cnt = 4; + } 
else if (is_bpf_obj_drop_kfunc(desc->func_id) || + is_bpf_percpu_obj_drop_kfunc(desc->func_id) || + is_bpf_refcount_acquire_kfunc(desc->func_id)) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; - if (env->log.level & BPF_LOG_LEVEL2) { - verbose(env, "Live regs before insn:\n"); - for (i = 0; i < insn_cnt; ++i) { - if (env->insn_aux_data[i].scc) - verbose(env, "%3d ", env->insn_aux_data[i].scc); - else - verbose(env, " "); - verbose(env, "%3d: ", i); - for (j = BPF_REG_0; j < BPF_REG_10; ++j) - if (insn_aux[i].live_regs_before & BIT(j)) - verbose(env, "%d", j); - else - verbose(env, "."); - verbose(env, " "); - verbose_insn(env, &insns[i]); - if (bpf_is_ldimm64(&insns[i])) - i++; + if (is_bpf_percpu_obj_drop_kfunc(desc->func_id) && kptr_struct_meta) { + verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", + insn_idx); + return -EFAULT; } - } -out: - kvfree(state); - return err; -} + if (is_bpf_refcount_acquire_kfunc(desc->func_id) && !kptr_struct_meta) { + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); + return -EFAULT; + } -/* - * Compute strongly connected components (SCCs) on the CFG. - * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc. - * If instruction is a sole member of its SCC and there are no self edges, - * assign it SCC number of zero. - * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation. 
- */ -static int compute_scc(struct bpf_verifier_env *env) -{ - const u32 NOT_ON_STACK = U32_MAX; + insn_buf[0] = addr[0]; + insn_buf[1] = addr[1]; + insn_buf[2] = *insn; + *cnt = 3; + } else if (is_bpf_list_push_kfunc(desc->func_id) || + is_bpf_rbtree_add_kfunc(desc->func_id)) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + int struct_meta_reg = BPF_REG_3; + int node_offset_reg = BPF_REG_4; - struct bpf_insn_aux_data *aux = env->insn_aux_data; - const u32 insn_cnt = env->prog->len; - int stack_sz, dfs_sz, err = 0; - u32 *stack, *pre, *low, *dfs; - u32 i, j, t, w; - u32 next_preorder_num; - u32 next_scc_id; - bool assign_scc; - struct bpf_iarray *succ; - - next_preorder_num = 1; - next_scc_id = 1; - /* - * - 'stack' accumulates vertices in DFS order, see invariant comment below; - * - 'pre[t] == p' => preorder number of vertex 't' is 'p'; - * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n'; - * - 'dfs' DFS traversal stack, used to emulate explicit recursion. 
- */ - stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); - pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); - low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); - dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT); - if (!stack || !pre || !low || !dfs) { - err = -ENOMEM; - goto exit; + /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ + if (is_bpf_rbtree_add_kfunc(desc->func_id)) { + struct_meta_reg = BPF_REG_4; + node_offset_reg = BPF_REG_5; + } + + if (!kptr_struct_meta) { + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); + return -EFAULT; + } + + __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg, + node_offset_reg, insn, insn_buf, cnt); + } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); + *cnt = 1; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline the bpf_session_is_return() for fsession: + * bool bpf_session_is_return(void *ctx) + * { + * return (((u64 *)ctx)[-1] >> BPF_TRAMP_IS_RETURN_SHIFT) & 1; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_IS_RETURN_SHIFT); + insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); + *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && + env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + /* + * inline bpf_session_cookie() for fsession: + * __u64 *bpf_session_cookie(void *ctx) + * { + * u64 off = (((u64 *)ctx)[-1] >> BPF_TRAMP_COOKIE_INDEX_SHIFT) & 0xFF; + * return &((u64 *)ctx)[-off]; + * } + */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, BPF_TRAMP_COOKIE_INDEX_SHIFT); + insn_buf[2] = 
BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFF); + insn_buf[3] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[4] = BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1); + insn_buf[5] = BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0); + *cnt = 6; } - /* - * References: - * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms" - * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components" - * - * The algorithm maintains the following invariant: - * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]'; - * - then, vertex 'u' remains on stack while vertex 'v' is on stack. - * - * Consequently: - * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u', - * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack, - * and thus there is an SCC (loop) containing both 'u' and 'v'. - * - If 'low[v] == pre[v]', loops containing 'v' have been explored, - * and 'v' can be considered the root of some SCC. - * - * Here is a pseudo-code for an explicitly recursive version of the algorithm: - * - * NOT_ON_STACK = insn_cnt + 1 - * pre = [0] * insn_cnt - * low = [0] * insn_cnt - * scc = [0] * insn_cnt - * stack = [] - * - * next_preorder_num = 1 - * next_scc_id = 1 - * - * def recur(w): - * nonlocal next_preorder_num - * nonlocal next_scc_id - * - * pre[w] = next_preorder_num - * low[w] = next_preorder_num - * next_preorder_num += 1 - * stack.append(w) - * for s in successors(w): - * # Note: for classic algorithm the block below should look as: - * # - * # if pre[s] == 0: - * # recur(s) - * # low[w] = min(low[w], low[s]) - * # elif low[s] != NOT_ON_STACK: - * # low[w] = min(low[w], pre[s]) - * # - * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])' - * # does not break the invariant and makes itartive version of the algorithm - * # simpler. See 'Algorithm #3' from [2]. 
- * - * # 's' not yet visited - * if pre[s] == 0: - * recur(s) - * # if 's' is on stack, pick lowest reachable preorder number from it; - * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]', - * # so 'min' would be a noop. - * low[w] = min(low[w], low[s]) - * - * if low[w] == pre[w]: - * # 'w' is the root of an SCC, pop all vertices - * # below 'w' on stack and assign same SCC to them. - * while True: - * t = stack.pop() - * low[t] = NOT_ON_STACK - * scc[t] = next_scc_id - * if t == w: - * break - * next_scc_id += 1 - * - * for i in range(0, insn_cnt): - * if pre[i] == 0: - * recur(i) - * - * Below implementation replaces explicit recursion with array 'dfs'. - */ - for (i = 0; i < insn_cnt; i++) { - if (pre[i]) - continue; - stack_sz = 0; - dfs_sz = 1; - dfs[0] = i; -dfs_continue: - while (dfs_sz) { - w = dfs[dfs_sz - 1]; - if (pre[w] == 0) { - low[w] = next_preorder_num; - pre[w] = next_preorder_num; - next_preorder_num++; - stack[stack_sz++] = w; - } - /* Visit 'w' successors */ - succ = bpf_insn_successors(env, w); - for (j = 0; j < succ->cnt; ++j) { - if (pre[succ->items[j]]) { - low[w] = min(low[w], low[succ->items[j]]); - } else { - dfs[dfs_sz++] = succ->items[j]; - goto dfs_continue; - } - } - /* - * Preserve the invariant: if some vertex above in the stack - * is reachable from 'w', keep 'w' on the stack. - */ - if (low[w] < pre[w]) { - dfs_sz--; - goto dfs_continue; - } - /* - * Assign SCC number only if component has two or more elements, - * or if component has a self reference, or if instruction is a - * callback calling function (implicit loop). - */ - assign_scc = stack[stack_sz - 1] != w; /* two or more elements? */ - for (j = 0; j < succ->cnt; ++j) { /* self reference? */ - if (succ->items[j] == w) { - assign_scc = true; - break; - } - } - if (bpf_calls_callback(env, w)) /* implicit loop? 
*/ - assign_scc = true; - /* Pop component elements from stack */ - do { - t = stack[--stack_sz]; - low[t] = NOT_ON_STACK; - if (assign_scc) - aux[t].scc = next_scc_id; - } while (t != w); - if (assign_scc) - next_scc_id++; - dfs_sz--; - } - } - env->scc_info = kvzalloc_objs(*env->scc_info, next_scc_id, - GFP_KERNEL_ACCOUNT); - if (!env->scc_info) { - err = -ENOMEM; - goto exit; - } - env->scc_cnt = next_scc_id; -exit: - kvfree(stack); - kvfree(pre); - kvfree(low); - kvfree(dfs); - return err; + + if (env->insn_aux_data[insn_idx].arg_prog) { + u32 regno = env->insn_aux_data[insn_idx].arg_prog; + struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(regno, (long)env->prog->aux) }; + int idx = *cnt; + + insn_buf[idx++] = ld_addrs[0]; + insn_buf[idx++] = ld_addrs[1]; + insn_buf[idx++] = *insn; + *cnt = idx; + } + return 0; } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) @@ -25878,7 +19928,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 goto err_free_env; for (i = 0; i < len; i++) env->insn_aux_data[i].orig_idx = i; - env->succ = iarray_realloc(NULL, 2); + env->succ = bpf_iarray_realloc(NULL, 2); if (!env->succ) goto err_free_env; env->prog = *prog; @@ -25939,7 +19989,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 INIT_LIST_HEAD(&env->explored_states[i]); INIT_LIST_HEAD(&env->free_list); - ret = check_btf_info_early(env, attr, uattr); + ret = bpf_check_btf_info_early(env, attr, uattr); if (ret < 0) goto skip_full_check; @@ -25951,11 +20001,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = check_btf_info(env, attr, uattr); + ret = bpf_check_btf_info(env, attr, uattr); if (ret < 0) goto skip_full_check; - ret = resolve_pseudo_ldimm64(env); + ret = check_and_resolve_insns(env); if (ret < 0) goto skip_full_check; @@ -25965,11 +20015,11 @@ int bpf_check(struct bpf_prog **prog, 
union bpf_attr *attr, bpfptr_t uattr, __u3 goto skip_full_check; } - ret = check_cfg(env); + ret = bpf_check_cfg(env); if (ret < 0) goto skip_full_check; - ret = compute_postorder(env); + ret = bpf_compute_postorder(env); if (ret < 0) goto skip_full_check; @@ -25981,11 +20031,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; - ret = compute_scc(env); + ret = bpf_compute_const_regs(env); + if (ret < 0) + goto skip_full_check; + + ret = bpf_prune_dead_branches(env); + if (ret < 0) + goto skip_full_check; + + ret = sort_subprogs_topo(env); + if (ret < 0) + goto skip_full_check; + + ret = bpf_compute_scc(env); if (ret < 0) goto skip_full_check; - ret = compute_live_registers(env); + ret = bpf_compute_live_registers(env); if (ret < 0) goto skip_full_check; @@ -26006,22 +20068,22 @@ skip_full_check: * allocate additional slots. */ if (ret == 0) - ret = remove_fastcall_spills_fills(env); + ret = bpf_remove_fastcall_spills_fills(env); if (ret == 0) ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ if (ret == 0) - ret = optimize_bpf_loop(env); + ret = bpf_optimize_bpf_loop(env); if (is_priv) { if (ret == 0) - opt_hard_wire_dead_code_branches(env); + bpf_opt_hard_wire_dead_code_branches(env); if (ret == 0) - ret = opt_remove_dead_code(env); + ret = bpf_opt_remove_dead_code(env); if (ret == 0) - ret = opt_remove_nops(env); + ret = bpf_opt_remove_nops(env); } else { if (ret == 0) sanitize_dead_code(env); @@ -26029,22 +20091,22 @@ skip_full_check: if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ - ret = convert_ctx_accesses(env); + ret = bpf_convert_ctx_accesses(env); if (ret == 0) - ret = do_misc_fixups(env); + ret = bpf_do_misc_fixups(env); /* do 32-bit optimization after insn patching has done so those patched * insns could be handled correctly. 
*/ if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) { - ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); + ret = bpf_opt_subreg_zext_lo32_rnd_hi32(env, attr); env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret : false; } if (ret == 0) - ret = fixup_call_args(env); + ret = bpf_fixup_call_args(env); env->verification_time = ktime_get_ns() - start_time; print_verification_stats(env); @@ -26103,6 +20165,14 @@ skip_full_check: adjust_btf_func(env); + /* extension progs temporarily inherit the attach_type of their targets + for verification purposes, so set it back to zero before returning + */ + if (env->prog->type == BPF_PROG_TYPE_EXT) + env->prog->expected_attach_type = 0; + + env->prog = __bpf_prog_select_runtime(env, env->prog, &ret); + err_release_maps: if (ret) release_insn_arrays(env); @@ -26114,19 +20184,13 @@ err_release_maps: if (!env->prog->aux->used_btfs) release_btfs(env); - /* extension progs temporarily inherit the attach_type of their targets - for verification purposes, so set it back to zero before returning - */ - if (env->prog->type == BPF_PROG_TYPE_EXT) - env->prog->expected_attach_type = 0; - *prog = env->prog; module_put(env->attach_btf_mod); err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); - clear_insn_aux_data(env, 0, env->prog->len); + bpf_clear_insn_aux_data(env, 0, env->prog->len); vfree(env->insn_aux_data); err_free_env: bpf_stack_liveness_free(env); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 3bfe37693d68..58797123b752 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -184,11 +184,6 @@ extern bool cgrp_dfl_visible; for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - static inline bool notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, 
&cgrp->flags); @@ -222,7 +217,6 @@ static inline void get_css_set(struct css_set *cset) } bool cgroup_ssid_enabled(int ssid); -bool cgroup_on_dfl(const struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c22cda7766d8..1f084ee71443 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -69,14 +69,6 @@ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* - * To avoid confusing the compiler (and generating warnings) with code - * that attempts to access what would be a 0-element array (i.e. sized - * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this - * constant expression can be added. - */ -#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) - -/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -107,12 +99,6 @@ static bool cgroup_debug __read_mostly; */ static DEFINE_SPINLOCK(cgroup_idr_lock); -/* - * Protects cgroup_file->kn for !self csses. It synchronizes notifications - * against file removal/re-creation across css hiding. - */ -static DEFINE_SPINLOCK(cgroup_file_kn_lock); - DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ @@ -510,27 +496,6 @@ static u32 cgroup_ss_mask(struct cgroup *cgrp) } /** - * cgroup_css - obtain a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns @cgrp->self) - * - * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This - * function must be called either under cgroup_mutex or rcu_read_lock() and - * the caller is responsible for pinning the returned css if it wants to - * keep accessing it outside the said locks. This function may return - * %NULL if @cgrp doesn't have @subsys_id enabled. 
- */ -static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - if (CGROUP_HAS_SUBSYS_CONFIG && ss) - return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_mutex)); - else - return &cgrp->self; -} - -/** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest * @ss: the subsystem of interest (%NULL returns @cgrp->self) @@ -741,32 +706,6 @@ EXPORT_SYMBOL_GPL(of_css); } \ } while (false) -/* iterate over child cgrps, lock should be held throughout iteration */ -#define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - cgroup_is_dead(child); })) \ - ; \ - else - -/* walk live descendants in pre order */ -#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ - css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - -/* walk live descendants in postorder */ -#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ - css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. 
It contains a pointer to the root state @@ -1748,9 +1687,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; - spin_lock_irq(&cgroup_file_kn_lock); - cfile->kn = NULL; - spin_unlock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); + WRITE_ONCE(cfile->kn, NULL); + spin_unlock_irq(&cfile->lock); timer_delete_sync(&cfile->notify_timer); } @@ -2126,6 +2065,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); + init_waitqueue_head(&cgrp->dying_populated_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -2608,6 +2548,7 @@ static void cgroup_migrate_add_task(struct task_struct *task, mgctx->tset.nr_tasks++; + css_set_skip_task_iters(cset, task); list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, @@ -4427,10 +4368,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); - - spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_init(&cfile->lock); cfile->kn = kn; - spin_unlock_irq(&cgroup_file_kn_lock); } return 0; @@ -4685,21 +4624,32 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) */ void cgroup_file_notify(struct cgroup_file *cfile) { - unsigned long flags; + unsigned long flags, last, next; + struct kernfs_node *kn = NULL; + + if (!READ_ONCE(cfile->kn)) + return; - spin_lock_irqsave(&cgroup_file_kn_lock, flags); + last = READ_ONCE(cfile->notified_at); + next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + if (time_in_range(jiffies, last, next)) { + timer_reduce(&cfile->notify_timer, next); + if (timer_pending(&cfile->notify_timer)) + return; + } + + spin_lock_irqsave(&cfile->lock, flags); if (cfile->kn) { 
- unsigned long last = cfile->notified_at; - unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + kn = cfile->kn; + kernfs_get(kn); + WRITE_ONCE(cfile->notified_at, jiffies); + } + spin_unlock_irqrestore(&cfile->lock, flags); - if (time_in_range(jiffies, last, next)) { - timer_reduce(&cfile->notify_timer, next); - } else { - kernfs_notify(cfile->kn); - cfile->notified_at = jiffies; - } + if (kn) { + kernfs_notify(kn); + kernfs_put(kn); } - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } EXPORT_SYMBOL_GPL(cgroup_file_notify); @@ -4712,10 +4662,10 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); kn = cfile->kn; kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + spin_unlock_irq(&cfile->lock); if (kn) kernfs_show(kn, show); @@ -5108,6 +5058,12 @@ repeat: return; task = list_entry(it->task_pos, struct task_struct, cg_list); + /* + * Hide tasks that are exiting but not yet removed. Keep zombie + * leaders with live threads visible. + */ + if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) + goto repeat; if (it->flags & CSS_TASK_ITER_PROCS) { /* if PROCS, skip over tasks which aren't group leaders */ @@ -6217,6 +6173,78 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return 0; }; +/** + * cgroup_drain_dying - wait for dying tasks to leave before rmdir + * @cgrp: the cgroup being removed + * + * cgroup.procs and cgroup.threads use css_task_iter which filters out + * PF_EXITING tasks so that userspace doesn't see tasks that have already been + * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the + * cgroup has non-empty css_sets - is only updated when dying tasks pass through + * cgroup_task_dead() in finish_task_switch(). 
This creates a window where + * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir + * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no + * tasks. + * + * This function aligns cgroup_has_tasks() with what userspace can observe. If + * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are + * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the + * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. + * + * This function only concerns itself with this cgroup's own dying tasks. + * Whether the cgroup has children is cgroup_destroy_locked()'s problem. + * + * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we + * retry the full check from scratch. + * + * Must be called with cgroup_mutex held. + */ +static int cgroup_drain_dying(struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +{ + struct css_task_iter it; + struct task_struct *task; + DEFINE_WAIT(wait); + + lockdep_assert_held(&cgroup_mutex); +retry: + if (!cgroup_has_tasks(cgrp)) + return 0; + + /* Same iterator as cgroup.threads - if any task is visible, it's busy */ + css_task_iter_start(&cgrp->self, 0, &it); + task = css_task_iter_next(&it); + css_task_iter_end(&it); + + if (task) + return -EBUSY; + + /* + * All remaining tasks are PF_EXITING and will pass through + * cgroup_task_dead() shortly. Wait for a kick and retry. + * + * cgroup_has_tasks() can't transition from false to true while we're + * holding cgroup_mutex, but the true to false transition happens + * under css_set_lock (via cgroup_task_dead()). We must retest and + * prepare_to_wait() under css_set_lock. Otherwise, the transition + * can happen between our first test and prepare_to_wait(), and we + * sleep with no one to wake us. 
+ */ + spin_lock_irq(&css_set_lock); + if (!cgroup_has_tasks(cgrp)) { + spin_unlock_irq(&css_set_lock); + return 0; + } + prepare_to_wait(&cgrp->dying_populated_waitq, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&css_set_lock); + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&cgrp->dying_populated_waitq, &wait); + mutex_lock(&cgroup_mutex); + goto retry; +} + int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -6226,9 +6254,12 @@ int cgroup_rmdir(struct kernfs_node *kn) if (!cgrp) return 0; - ret = cgroup_destroy_locked(cgrp); - if (!ret) - TRACE_CGROUP_PATH(rmdir, cgrp); + ret = cgroup_drain_dying(cgrp); + if (!ret) { + ret = cgroup_destroy_locked(cgrp); + if (!ret) + TRACE_CGROUP_PATH(rmdir, cgrp); + } cgroup_kn_unlock(kn); return ret; @@ -6988,6 +7019,7 @@ void cgroup_task_exit(struct task_struct *tsk) static void do_cgroup_task_dead(struct task_struct *tsk) { + struct cgrp_cset_link *link; struct css_set *cset; unsigned long flags; @@ -7001,6 +7033,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk) if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); + /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) + if (waitqueue_active(&link->cgrp->dying_populated_waitq)) + wake_up(&link->cgrp->dying_populated_waitq); + if (dl_task(tsk)) dec_dl_tasks_cs(tsk); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 9faf34377a88..1335e437098e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -62,6 +62,75 @@ static const char * const perr_strings[] = { }; /* + * CPUSET Locking Convention + * ------------------------- + * + * Below are the four global/local locks guarding cpuset structures in lock + * acquisition order: + * - cpuset_top_mutex + * - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock) + * - cpuset_mutex + * - callback_lock (raw spinlock) + * + * As cpuset will now 
indirectly flush a number of different workqueues in + * housekeeping_update() to update housekeeping cpumasks when the set of + * isolated CPUs is going to be changed, it may be vulnerable to deadlock + * if we hold cpus_read_lock while calling into housekeeping_update(). + * + * The first cpuset_top_mutex will be held except when calling into + * cpuset_handle_hotplug() from the CPU hotplug code where cpus_write_lock + * and cpuset_mutex will be held instead. The main purpose of this mutex + * is to prevent regular cpuset control file write actions from interfering + * with the call to housekeeping_update(), though CPU hotplug operation can + * still happen in parallel. This mutex also provides protection for some + * internal variables. + * + * A task must hold all the remaining three locks to modify externally visible + * or used fields of cpusets, though some of the internally used cpuset fields + * and internal variables can be modified without holding callback_lock. If only + * reliable read access of the externally used fields are needed, a task can + * hold either cpuset_mutex or callback_lock which are exposed to other + * external subsystems. + * + * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others, + * ensuring that it is the only task able to also acquire callback_lock and + * be able to modify cpusets. It can perform various checks on the cpuset + * structure first, knowing nothing will change. It can also allocate memory + * without holding callback_lock. While it is performing these checks, various + * callback routines can briefly acquire callback_lock to query cpusets. Once + * it is ready to make the changes, it takes callback_lock, blocking everyone + * else. + * + * Calls to the kernel memory allocator cannot be made while holding + * callback_lock which is a spinlock, as the memory allocator may sleep or + * call back into cpuset code and acquire callback_lock. 
+ * + * Now, the task_struct fields mems_allowed and mempolicy may be changed + * by other task, we use alloc_lock in the task_struct fields to protect + * them. + * + * The cpuset_common_seq_show() handlers only hold callback_lock across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + */ + +static DEFINE_MUTEX(cpuset_top_mutex); +static DEFINE_MUTEX(cpuset_mutex); + +/* + * File level internal variables below follow one of the following exclusion + * rules. + * + * RWCS: Read/write-able by holding either cpus_write_lock (and optionally + * cpuset_mutex) or both cpus_read_lock and cpuset_mutex. + * + * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable + * by holding both cpuset_mutex and callback_lock. + * + * T: Read/write-able by holding the cpuset_top_mutex. + */ + +/* * For local partitions, update to subpartitions_cpus & isolated_cpus is done * in update_parent_effective_cpumask(). For remote partitions, it is done in * the remote_partition_*() and remote_cpus_update() helpers. @@ -70,19 +139,22 @@ static const char * const perr_strings[] = { * Exclusive CPUs distributed out to local or remote sub-partitions of * top_cpuset */ -static cpumask_var_t subpartitions_cpus; +static cpumask_var_t subpartitions_cpus; /* RWCS */ /* - * Exclusive CPUs in isolated partitions + * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated) */ -static cpumask_var_t isolated_cpus; +static cpumask_var_t isolated_cpus; /* CSCB */ /* - * isolated_cpus updating flag (protected by cpuset_mutex) - * Set if isolated_cpus is going to be updated in the current - * cpuset_mutex crtical section. + * Set if housekeeping cpumasks are to be updated. 
*/ -static bool isolated_cpus_updating; +static bool update_housekeeping; /* RWCS */ + +/* + * Copy of isolated_cpus to be passed to housekeeping_update() + */ +static cpumask_var_t isolated_hk_cpus; /* T */ /* * A flag to force sched domain rebuild at the end of an operation. @@ -98,7 +170,7 @@ static bool isolated_cpus_updating; * Note that update_relax_domain_level() in cpuset-v1.c can still call * rebuild_sched_domains_locked() directly without using this flag. */ -static bool force_sd_rebuild; +static bool force_sd_rebuild; /* RWCS */ /* * Partition root states: @@ -218,42 +290,6 @@ struct cpuset top_cpuset = { .partition_root_state = PRS_ROOT, }; -/* - * There are two global locks guarding cpuset structures - cpuset_mutex and - * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel - * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset - * structures. Note that cpuset_mutex needs to be a mutex as it is used in - * paths that rely on priority inheritance (e.g. scheduler - on RT) for - * correctness. - * - * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, it blocks others, ensuring that it is the only task able to - * also acquire callback_lock and be able to modify cpusets. It can perform - * various checks on the cpuset structure first, knowing nothing will change. - * It can also allocate memory while just holding cpuset_mutex. While it is - * performing these checks, various callback routines can briefly acquire - * callback_lock to query cpusets. Once it is ready to make the changes, it - * takes callback_lock, blocking everyone else. - * - * Calls to the kernel memory allocator can not be made while holding - * callback_lock, as that would risk double tripping on callback_lock - * from one of the callbacks into the cpuset code from within - * __alloc_pages(). - * - * If a task is only holding callback_lock, then it has read-only - * access to cpusets. 
- * - * Now, the task_struct fields mems_allowed and mempolicy may be changed - * by other task, we use alloc_lock in the task_struct fields to protect - * them. - * - * The cpuset_common_seq_show() handlers only hold callback_lock across - * small pieces of code, such as when reading out possibly multi-word - * cpumasks and nodemasks. - */ - -static DEFINE_MUTEX(cpuset_mutex); - /** * cpuset_lock - Acquire the global cpuset mutex * @@ -283,6 +319,7 @@ void lockdep_assert_cpuset_lock_held(void) */ void cpuset_full_lock(void) { + mutex_lock(&cpuset_top_mutex); cpus_read_lock(); mutex_lock(&cpuset_mutex); } @@ -291,12 +328,14 @@ void cpuset_full_unlock(void) { mutex_unlock(&cpuset_mutex); cpus_read_unlock(); + mutex_unlock(&cpuset_top_mutex); } #ifdef CONFIG_LOCKDEP bool lockdep_is_cpuset_held(void) { - return lockdep_is_held(&cpuset_mutex); + return lockdep_is_held(&cpuset_mutex) || + lockdep_is_held(&cpuset_top_mutex); } #endif @@ -840,7 +879,7 @@ generate_doms: /* * Cgroup v2 doesn't support domain attributes, just set all of them * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a - * subset of HK_TYPE_DOMAIN housekeeping CPUs. + * subset of HK_TYPE_DOMAIN_BOOT housekeeping CPUs. */ for (i = 0; i < ndoms; i++) { /* @@ -849,7 +888,7 @@ generate_doms: */ if (!csa || csa[i] == &top_cpuset) cpumask_and(doms[i], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_TYPE_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)); else cpumask_copy(doms[i], csa[i]->effective_cpus); if (dattr) @@ -961,7 +1000,7 @@ void rebuild_sched_domains_locked(void) * offline CPUs, a warning is emitted and we return directly to * prevent the panic. 
*/ - for (i = 0; i < ndoms; ++i) { + for (i = 0; doms && i < ndoms; i++) { if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) return; } @@ -1161,12 +1200,18 @@ static void reset_partition_data(struct cpuset *cs) static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); - if (new_prs == PRS_ISOLATED) + lockdep_assert_held(&callback_lock); + lockdep_assert_held(&cpuset_mutex); + if (new_prs == PRS_ISOLATED) { + if (cpumask_subset(xcpus, isolated_cpus)) + return; cpumask_or(isolated_cpus, isolated_cpus, xcpus); - else + } else { + if (!cpumask_intersects(xcpus, isolated_cpus)) + return; cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); - - isolated_cpus_updating = true; + } + update_housekeeping = true; } /* @@ -1219,8 +1264,8 @@ static void partition_xcpus_del(int old_prs, struct cpuset *parent, isolated_cpus_update(old_prs, parent->partition_root_state, xcpus); - cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); + cpumask_and(parent->effective_cpus, parent->effective_cpus, cpu_active_mask); } /* @@ -1284,22 +1329,45 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) } /* - * update_isolation_cpumasks - Update external isolation related CPU masks + * cpuset_update_sd_hk_unlock - Rebuild sched domains, update HK & unlock * - * The following external CPU masks will be updated if necessary: - * - workqueue unbound cpumask + * Update housekeeping cpumasks and rebuild sched domains if necessary and + * then do a cpuset_full_unlock(). + * This should be called at the end of cpuset operation. 
*/ -static void update_isolation_cpumasks(void) +static void cpuset_update_sd_hk_unlock(void) + __releases(&cpuset_mutex) + __releases(&cpuset_top_mutex) { - int ret; + /* force_sd_rebuild will be cleared in rebuild_sched_domains_locked() */ + if (force_sd_rebuild) + rebuild_sched_domains_locked(); - if (!isolated_cpus_updating) - return; + if (update_housekeeping) { + update_housekeeping = false; + cpumask_copy(isolated_hk_cpus, isolated_cpus); - ret = housekeeping_update(isolated_cpus); - WARN_ON_ONCE(ret < 0); + /* + * housekeeping_update() is now called without holding + * cpus_read_lock and cpuset_mutex. Only cpuset_top_mutex + * is still being held for mutual exclusion. + */ + mutex_unlock(&cpuset_mutex); + cpus_read_unlock(); + WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus)); + mutex_unlock(&cpuset_top_mutex); + } else { + cpuset_full_unlock(); + } +} - isolated_cpus_updating = false; +/* + * Work function to invoke cpuset_update_sd_hk_unlock() + */ +static void hk_sd_workfn(struct work_struct *work) +{ + cpuset_full_lock(); + cpuset_update_sd_hk_unlock(); } /** @@ -1450,7 +1518,6 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, cs->remote_partition = true; cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); cpuset_force_rebuild(); cs->prs_err = 0; @@ -1495,7 +1562,6 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); cpuset_force_rebuild(); /* @@ -1566,7 +1632,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); if (adding || deleting) cpuset_force_rebuild(); @@ -1910,7 +1975,6 @@ write_error: partition_xcpus_add(new_prs, parent, tmp->delmask); spin_unlock_irq(&callback_lock); 
- update_isolation_cpumasks(); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); @@ -2155,7 +2219,7 @@ get_css: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - cpuset_update_tasks_cpumask(cp, cp->effective_cpus); + cpuset_update_tasks_cpumask(cp, tmp->new_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE @@ -2878,7 +2942,6 @@ out: else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); @@ -2925,7 +2988,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs, *oldcs; struct task_struct *task; - bool cpus_updated, mems_updated; + bool setsched_check; int ret; /* used later by cpuset_attach() */ @@ -2940,20 +3003,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) if (ret) goto out_unlock; - cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); - mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + /* + * Skip rights over task setsched check in v2 when nothing changes, + * migration permission derives from hierarchy ownership in + * cgroup_procs_write_permission()). + */ + setsched_check = !cpuset_v2() || + !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) || + !nodes_equal(cs->effective_mems, oldcs->effective_mems); + + /* + * A v1 cpuset with tasks will have no CPU left only when CPU hotplug + * brings the last online CPU offline as users are not allowed to empty + * cpuset.cpus when there are active tasks inside. When that happens, + * we should allow tasks to migrate out without security check to make + * sure they will be able to run after migration. 
+ */ + if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus)) + setsched_check = false; cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task); if (ret) goto out_unlock; - /* - * Skip rights over task check in v2 when nothing changes, - * migration permission derives from hierarchy ownership in - * cgroup_procs_write_permission()). - */ - if (!cpuset_v2() || (cpus_updated || mems_updated)) { + if (setsched_check) { ret = security_task_setscheduler(task); if (ret) goto out_unlock; @@ -3168,10 +3242,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of, } free_cpuset(trialcs); - if (force_sd_rebuild) - rebuild_sched_domains_locked(); out_unlock: - cpuset_full_unlock(); + cpuset_update_sd_hk_unlock(); if (of_cft(of)->private == FILE_MEMLIST) schedule_flush_migrate_mm(); return retval ?: nbytes; @@ -3278,7 +3350,7 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, cpuset_full_lock(); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); - cpuset_full_unlock(); + cpuset_update_sd_hk_unlock(); return retval ?: nbytes; } @@ -3452,7 +3524,7 @@ static void cpuset_css_killed(struct cgroup_subsys_state *css) /* Reset valid partition back to member */ if (is_partition_valid(cs)) update_prstate(cs, PRS_MEMBER); - cpuset_full_unlock(); + cpuset_update_sd_hk_unlock(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -3607,6 +3679,7 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&isolated_hk_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); @@ -3778,6 +3851,7 @@ unlock: */ static void cpuset_handle_hotplug(void) { + static DECLARE_WORK(hk_sd_work, hk_sd_workfn); static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; @@ 
-3859,9 +3933,25 @@ static void cpuset_handle_hotplug(void) rcu_read_unlock(); } - /* rebuild sched domains if necessary */ + /* + * rebuild_sched_domains() will always be called directly if needed + * to make sure that newly added or removed CPU will be reflected in + * the sched domains. However, if isolated partition invalidation + * or recreation is being done (update_housekeeping set), a work item + * will be queued to call housekeeping_update() to update the + * corresponding housekeeping cpumasks after some slight delay. + * + * We rely on WORK_STRUCT_PENDING_BIT to not requeue a work item that + * is still pending. Before the pending bit is cleared, the work data + * is copied out and work item dequeued. So it is possible to queue + * the work again before the hk_sd_workfn() is invoked to process the + * previously queued work. Since hk_sd_workfn() doesn't use the work + * item at all, this is not a problem. + */ if (force_sd_rebuild) rebuild_sched_domains_cpuslocked(); + if (update_housekeeping) + queue_work(system_dfl_wq, &hk_sd_work); free_tmpmasks(ptmp); } diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 9d95824dc6fa..1ab1fb47f271 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -707,8 +707,7 @@ static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v) return 0; } -static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region, - u64 *new_limit) +static int dmemcg_parse_limit(char *options, u64 *new_limit) { char *end; @@ -762,7 +761,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, if (!region) return -EINVAL; - err = dmemcg_parse_limit(options, region, &new_limit); + err = dmemcg_parse_limit(options, &new_limit); if (err < 0) goto out_put; diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 09258eebb5c7..9967fb25c563 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -173,7 +173,7 @@ uncharge_cg_locked(struct rdma_cgroup *cg, * the system. 
*/ if (unlikely(!rpool)) { - pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); + pr_warn("Invalid device %p or rdma cgroup %p\n", device, cg); return; } diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 774702591d26..307c97ac5fa9 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -29,7 +29,6 @@ CONFIG_SECTION_MISMATCH_WARN_ONLY=y # CONFIG_UBSAN_ALIGNMENT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_TRAP is not set -# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set CONFIG_DEBUG_FS=y CONFIG_DEBUG_FS_ALLOW_ALL=y CONFIG_DEBUG_IRQFLAGS=y diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 2c1a3791e410..4f21fc3b108b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -27,8 +27,6 @@ #include <asm/page.h> #include <asm/sections.h> -#include <crypto/sha1.h> - #include "kallsyms_internal.h" #include "kexec_internal.h" diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c index 1f4067fbdb94..cb875ddb6ba6 100644 --- a/kernel/crash_dump_dm_crypt.c +++ b/kernel/crash_dump_dm_crypt.c @@ -6,6 +6,7 @@ #include <linux/cc_platform.h> #include <linux/configfs.h> #include <linux/module.h> +#include <linux/sysfs.h> #define KEY_NUM_MAX 128 /* maximum dm crypt keys */ #define KEY_SIZE_MAX 256 /* maximum dm crypt key size */ @@ -115,7 +116,7 @@ static int restore_dm_crypt_keys_to_thread_keyring(void) addr = dm_crypt_keys_addr; dm_crypt_keys_read((char *)&key_count, sizeof(key_count), &addr); - if (key_count < 0 || key_count > KEY_NUM_MAX) { + if (key_count > KEY_NUM_MAX) { kexec_dprintk("Failed to read the number of dm-crypt keys\n"); return -1; } @@ -139,7 +140,7 @@ static int restore_dm_crypt_keys_to_thread_keyring(void) return 0; } -static int read_key_from_user_keying(struct dm_crypt_key *dm_key) +static int read_key_from_user_keyring(struct dm_crypt_key *dm_key) { const struct user_key_payload *ukp; struct key *key; @@ -168,8 +169,8 @@ static int 
read_key_from_user_keying(struct dm_crypt_key *dm_key) memcpy(dm_key->data, ukp->data, ukp->datalen); dm_key->key_size = ukp->datalen; - kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size, - dm_key->key_desc, dm_key->data); + kexec_dprintk("Get dm crypt key (size=%u) %s\n", dm_key->key_size, + dm_key->key_desc); out: up_read(&key->sem); @@ -189,7 +190,7 @@ static inline struct config_key *to_config_key(struct config_item *item) static ssize_t config_key_description_show(struct config_item *item, char *page) { - return sprintf(page, "%s\n", to_config_key(item)->description); + return sysfs_emit(page, "%s\n", to_config_key(item)->description); } static ssize_t config_key_description_store(struct config_item *item, @@ -265,7 +266,7 @@ static struct config_item *config_keys_make_item(struct config_group *group, static ssize_t config_keys_count_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", key_count); + return sysfs_emit(page, "%d\n", key_count); } CONFIGFS_ATTR_RO(config_keys_, count); @@ -274,7 +275,7 @@ static bool is_dm_key_reused; static ssize_t config_keys_reuse_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", is_dm_key_reused); + return sysfs_emit(page, "%d\n", is_dm_key_reused); } static ssize_t config_keys_reuse_store(struct config_item *item, @@ -321,7 +322,7 @@ static bool restore; static ssize_t config_keys_restore_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", restore); + return sysfs_emit(page, "%d\n", restore); } static ssize_t config_keys_restore_store(struct config_item *item, @@ -387,7 +388,7 @@ static int build_keys_header(void) strscpy(keys_header->keys[i].key_desc, key->description, KEY_DESC_MAX_LEN); - r = read_key_from_user_keying(&keys_header->keys[i]); + r = read_key_from_user_keyring(&keys_header->keys[i]); if (r != 0) { kexec_dprintk("Failed to read key %s\n", keys_header->keys[i].key_desc); @@ -414,14 +415,16 @@ int 
crash_load_dm_crypt_keys(struct kimage *image) if (key_count <= 0) { kexec_dprintk("No dm-crypt keys\n"); - return -ENOENT; + return 0; } if (!is_dm_key_reused) { image->dm_crypt_keys_addr = 0; r = build_keys_header(); - if (r) + if (r) { + pr_err("Failed to build dm-crypt keys header, ret=%d\n", r); return r; + } } kbuf.buffer = keys_header; @@ -432,6 +435,7 @@ int crash_load_dm_crypt_keys(struct kimage *image) kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; r = kexec_add_buffer(&kbuf); if (r) { + pr_err("Failed to call kexec_add_buffer, ret=%d\n", r); kvfree((void *)kbuf.buffer); return r; } diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 62e60e0223cf..eee37a11380c 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -20,8 +20,6 @@ #include <asm/page.h> #include <asm/sections.h> -#include <crypto/sha1.h> - #include "kallsyms_internal.h" #include "kexec_internal.h" diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 159900736f25..bfef21b4a9ae 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -72,6 +72,9 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool +config ARCH_HAS_BATCHED_DMA_SYNC + bool + # # Select this option if the architecture assumes DMA devices are coherent # by default. 
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 1147497bc512..bcdc0f76d2e8 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -362,17 +362,11 @@ static void rmem_dma_device_release(struct reserved_mem *rmem, dev->dma_mem = NULL; } -static const struct reserved_mem_ops rmem_dma_ops = { - .device_init = rmem_dma_device_init, - .device_release = rmem_dma_device_release, -}; -static int __init rmem_dma_setup(struct reserved_mem *rmem) +static int __init rmem_dma_setup(unsigned long node, struct reserved_mem *rmem) { - unsigned long node = rmem->fdt_node; - if (of_get_flat_dt_prop(node, "reusable", NULL)) - return -EINVAL; + return -ENODEV; #ifdef CONFIG_ARM if (!of_get_flat_dt_prop(node, "no-map", NULL)) { @@ -390,7 +384,6 @@ static int __init rmem_dma_setup(struct reserved_mem *rmem) } #endif - rmem->ops = &rmem_dma_ops; pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); return 0; @@ -407,5 +400,11 @@ static int __init dma_init_reserved_memory(void) core_initcall(dma_init_reserved_memory); #endif /* CONFIG_DMA_GLOBAL_POOL */ -RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); +static const struct reserved_mem_ops rmem_dma_ops = { + .node_init = rmem_dma_setup, + .device_init = rmem_dma_device_init, + .device_release = rmem_dma_device_release, +}; + +RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", &rmem_dma_ops); #endif diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index c56004d314dc..03f52bd17120 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -42,7 +42,6 @@ #include <linux/memblock.h> #include <linux/err.h> #include <linux/sizes.h> -#include <linux/dma-buf/heaps/cma.h> #include <linux/dma-map-ops.h> #include <linux/cma.h> #include <linux/nospec.h> @@ -53,7 +52,38 @@ #define CMA_SIZE_MBYTES 0 #endif -struct cma *dma_contiguous_default_area; +static struct cma *dma_contiguous_areas[MAX_CMA_AREAS]; +static unsigned 
int dma_contiguous_areas_num; + +static int dma_contiguous_insert_area(struct cma *cma) +{ + if (dma_contiguous_areas_num >= ARRAY_SIZE(dma_contiguous_areas)) + return -EINVAL; + + dma_contiguous_areas[dma_contiguous_areas_num++] = cma; + + return 0; +} + +/** + * dma_contiguous_get_area_by_idx() - Get contiguous area at given index + * @idx: index of the area we query + * + * Queries for the contiguous area located at index @idx. + * + * Returns: + * A pointer to the requested contiguous area, or NULL otherwise. + */ +struct cma *dma_contiguous_get_area_by_idx(unsigned int idx) +{ + if (idx >= dma_contiguous_areas_num) + return NULL; + + return dma_contiguous_areas[idx]; +} +EXPORT_SYMBOL_GPL(dma_contiguous_get_area_by_idx); + +static struct cma *dma_contiguous_default_area; /* * Default global CMA area size can be defined in kernel's .config. @@ -91,15 +121,14 @@ static int __init early_cma(char *p) } early_param("cma", early_cma); -/* - * cma_skip_dt_default_reserved_mem - This is called from the - * reserved_mem framework to detect if the default cma region is being - * set by the "cma=" kernel parameter. - */ -bool __init cma_skip_dt_default_reserved_mem(void) +struct cma *dev_get_cma_area(struct device *dev) { - return size_cmdline != -1; + if (dev && dev->cma_area) + return dev->cma_area; + + return dma_contiguous_default_area; } +EXPORT_SYMBOL_GPL(dev_get_cma_area); #ifdef CONFIG_DMA_NUMA_CMA @@ -264,9 +293,24 @@ void __init dma_contiguous_reserve(phys_addr_t limit) if (ret) return; - ret = dma_heap_cma_register_heap(dma_contiguous_default_area); + /* + * We need to insert the new area in our list to avoid + * any inconsistencies between having the default area + * listed in the DT or not. + * + * The DT case is handled by rmem_cma_setup() and will + * always insert all its areas in our list. 
However, if + * it didn't run (because OF_RESERVED_MEM isn't set, or + * there's no DT region specified), then we don't have a + * default area yet, and no area in our list. + * + * This block creates the default area in such a case, + * but we also need to insert it in our list to avoid + * having a default area but an empty list. + */ + ret = dma_contiguous_insert_area(dma_contiguous_default_area); if (ret) - pr_warn("Couldn't register default CMA heap."); + pr_warn("Couldn't queue default CMA region for heap creation."); } } @@ -470,47 +514,89 @@ static void rmem_cma_device_release(struct reserved_mem *rmem, dev->cma_area = NULL; } -static const struct reserved_mem_ops rmem_cma_ops = { - .device_init = rmem_cma_device_init, - .device_release = rmem_cma_device_release, -}; +static int __init __rmem_cma_verify_node(unsigned long node) +{ + if (!of_get_flat_dt_prop(node, "reusable", NULL) || + of_get_flat_dt_prop(node, "no-map", NULL)) + return -ENODEV; + + if (size_cmdline != -1 && + of_get_flat_dt_prop(node, "linux,cma-default", NULL)) { + pr_err("Skipping dt linux,cma-default node in favor for \"cma=\" kernel param.\n"); + return -EBUSY; + } + return 0; +} -static int __init rmem_cma_setup(struct reserved_mem *rmem) +static int __init rmem_cma_validate(unsigned long node, phys_addr_t *align) +{ + int ret = __rmem_cma_verify_node(node); + + if (ret) + return ret; + + if (align) + *align = max_t(phys_addr_t, *align, CMA_MIN_ALIGNMENT_BYTES); + + return 0; +} + +static int __init rmem_cma_fixup(unsigned long node, phys_addr_t base, + phys_addr_t size) +{ + int ret = __rmem_cma_verify_node(node); + + if (ret) + return ret; + + /* Architecture specific contiguous memory fixup. 
*/ + dma_contiguous_early_fixup(base, size); + return 0; +} + +static int __init rmem_cma_setup(unsigned long node, struct reserved_mem *rmem) { - unsigned long node = rmem->fdt_node; bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; - int err; + int ret; - if (!of_get_flat_dt_prop(node, "reusable", NULL) || - of_get_flat_dt_prop(node, "no-map", NULL)) - return -EINVAL; + ret = __rmem_cma_verify_node(node); + if (ret) + return ret; if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) { pr_err("Reserved memory: incorrect alignment of CMA region\n"); return -EINVAL; } - err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); - if (err) { + ret = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); + if (ret) { pr_err("Reserved memory: unable to setup CMA region\n"); - return err; + return ret; } if (default_cma) dma_contiguous_default_area = cma; - rmem->ops = &rmem_cma_ops; rmem->priv = cma; pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); - err = dma_heap_cma_register_heap(cma); - if (err) - pr_warn("Couldn't register CMA heap."); + ret = dma_contiguous_insert_area(cma); + if (ret) + pr_warn("Couldn't store CMA reserved area."); return 0; } -RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup); + +static const struct reserved_mem_ops rmem_cma_ops = { + .node_validate = rmem_cma_validate, + .node_fixup = rmem_cma_fixup, + .node_init = rmem_cma_setup, + .device_init = rmem_cma_device_init, + .device_release = rmem_cma_device_release, +}; + +RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", &rmem_cma_ops); #endif diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 86f87e43438c..1a725edbbbf6 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -453,7 +453,7 @@ static int active_cacheline_set_overlap(phys_addr_t cln, int overlap) return overlap; } -static void 
active_cacheline_inc_overlap(phys_addr_t cln) +static void active_cacheline_inc_overlap(phys_addr_t cln, bool is_cache_clean) { int overlap = active_cacheline_read_overlap(cln); @@ -462,7 +462,7 @@ static void active_cacheline_inc_overlap(phys_addr_t cln) /* If we overflowed the overlap counter then we're potentially * leaking dma-mappings. */ - WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, + WARN_ONCE(!is_cache_clean && overlap > ACTIVE_CACHELINE_MAX_OVERLAP, pr_fmt("exceeded %d overlapping mappings of cacheline %pa\n"), ACTIVE_CACHELINE_MAX_OVERLAP, &cln); } @@ -495,7 +495,7 @@ static int active_cacheline_insert(struct dma_debug_entry *entry, if (rc == -EEXIST) { struct dma_debug_entry *existing; - active_cacheline_inc_overlap(cln); + active_cacheline_inc_overlap(cln, entry->is_cache_clean); existing = radix_tree_lookup(&dma_active_cacheline, cln); /* A lookup failure here after we got -EEXIST is unexpected. */ WARN_ON(!existing); @@ -601,7 +601,8 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) unsigned long flags; int rc; - entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN); + entry->is_cache_clean = attrs & (DMA_ATTR_DEBUGGING_IGNORE_CACHELINES | + DMA_ATTR_REQUIRE_COHERENT); bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); @@ -614,6 +615,7 @@ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !(entry->is_cache_clean && overlap_cache_clean) && + dma_get_cache_alignment() >= L1_CACHE_BYTES && !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && is_swiotlb_active(entry->dev))) { err_printk(entry->dev, entry, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8f43a930716d..ec887f443741 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -406,6 +406,8 @@ void dma_direct_sync_sg_for_device(struct device *dev, arch_sync_dma_for_device(paddr, sg->length, dir); } + if 
(!dev_is_dma_coherent(dev)) + arch_sync_dma_flush(); } #endif @@ -427,8 +429,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); } - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_flush(); arch_sync_dma_for_cpu_all(); + } } /* @@ -440,14 +444,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, { struct scatterlist *sg; int i; + bool need_sync = false; for_each_sg(sgl, sg, nents, i) { - if (sg_dma_is_bus_address(sg)) + if (sg_dma_is_bus_address(sg)) { sg_dma_unmark_bus_address(sg); - else + } else { + need_sync = true; dma_direct_unmap_phys(dev, sg->dma_address, - sg_dma_len(sg), dir, attrs); + sg_dma_len(sg), dir, attrs, false); + } } + if (need_sync && !dev_is_dma_coherent(dev)) + arch_sync_dma_flush(); } #endif @@ -457,6 +466,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, struct pci_p2pdma_map_state p2pdma_state = {}; struct scatterlist *sg; int i, ret; + bool need_sync = false; for_each_sg(sgl, sg, nents, i) { switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) { @@ -468,8 +478,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, */ break; case PCI_P2PDMA_MAP_NONE: + need_sync = true; sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), - sg->length, dir, attrs); + sg->length, dir, attrs, false); if (sg->dma_address == DMA_MAPPING_ERROR) { ret = -EIO; goto out_unmap; @@ -488,6 +499,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, sg_dma_len(sg) = sg->length; } + if (need_sync && !dev_is_dma_coherent(dev)) + arch_sync_dma_flush(); return nents; out_unmap: diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index f476c63b668c..7140c208c123 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -60,17 +60,22 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, swiotlb_sync_single_for_device(dev, paddr, 
size, dir); - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_device(paddr, size, dir); + arch_sync_dma_flush(); + } } static inline void dma_direct_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) + dma_addr_t addr, size_t size, enum dma_data_direction dir, + bool flush) { phys_addr_t paddr = dma_to_phys(dev, addr); if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_cpu(paddr, size, dir); + if (flush) + arch_sync_dma_flush(); arch_sync_dma_for_cpu_all(); } @@ -79,26 +84,35 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, static inline dma_addr_t dma_direct_map_phys(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, - unsigned long attrs) + unsigned long attrs, bool flush) { dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (attrs & DMA_ATTR_MMIO) - goto err_overflow; + if (!(attrs & DMA_ATTR_CC_SHARED)) { + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) + return DMA_MAPPING_ERROR; - return swiotlb_map(dev, phys, size, dir, attrs); + return swiotlb_map(dev, phys, size, dir, attrs); + } + } else if (attrs & DMA_ATTR_CC_SHARED) { + return DMA_MAPPING_ERROR; } if (attrs & DMA_ATTR_MMIO) { dma_addr = phys; if (unlikely(!dma_capable(dev, dma_addr, size, false))) goto err_overflow; + } else if (attrs & DMA_ATTR_CC_SHARED) { + dma_addr = phys_to_dma_unencrypted(dev, phys); + if (unlikely(!dma_capable(dev, dma_addr, size, false))) + goto err_overflow; } else { dma_addr = phys_to_dma(dev, phys); if (unlikely(!dma_capable(dev, dma_addr, size, true)) || dma_kmalloc_needs_bounce(dev, size, dir)) { - if (is_swiotlb_active(dev)) + if (is_swiotlb_active(dev) && + !(attrs & DMA_ATTR_REQUIRE_COHERENT)) return swiotlb_map(dev, phys, size, dir, attrs); goto err_overflow; @@ -106,8 +120,11 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev, } if (!dev_is_dma_coherent(dev) && - !(attrs & 
(DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) { arch_sync_dma_for_device(phys, size, dir); + if (flush) + arch_sync_dma_flush(); + } return dma_addr; err_overflow: @@ -119,17 +136,18 @@ err_overflow: } static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) + size_t size, enum dma_data_direction dir, unsigned long attrs, + bool flush) { phys_addr_t phys; - if (attrs & DMA_ATTR_MMIO) + if (attrs & (DMA_ATTR_MMIO | DMA_ATTR_REQUIRE_COHERENT)) /* nothing to do: uncached and no swiotlb */ return; phys = dma_to_phys(dev, addr); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); + dma_direct_sync_single_for_cpu(dev, addr, size, dir, flush); swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 0f33b3ea7daf..29eeb5fdf199 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -5,6 +5,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cleanup.h> #include <linux/debugfs.h> #include <linux/delay.h> #include <linux/device.h> @@ -15,6 +16,7 @@ #include <linux/module.h> #include <linux/pci.h> #include <linux/platform_device.h> +#include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include <uapi/linux/map_benchmark.h> @@ -31,17 +33,219 @@ struct map_benchmark_data { atomic64_t loops; }; +struct map_benchmark_ops { + void *(*prepare)(struct map_benchmark_data *map); + void (*unprepare)(void *mparam); + void (*initialize_data)(void *mparam); + int (*do_map)(void *mparam); + void (*do_unmap)(void *mparam); +}; + +struct dma_single_map_param { + struct device *dev; + dma_addr_t addr; + void *xbuf; + u32 npages; + u32 dma_dir; +}; + +static void *dma_single_map_benchmark_prepare(struct map_benchmark_data *map) +{ + struct 
dma_single_map_param *params __free(kfree) = kzalloc(sizeof(*params), + GFP_KERNEL); + if (!params) + return NULL; + + params->npages = map->bparam.granule; + params->dma_dir = map->bparam.dma_dir; + params->dev = map->dev; + params->xbuf = alloc_pages_exact(params->npages * PAGE_SIZE, GFP_KERNEL); + if (!params->xbuf) + return NULL; + + return_ptr(params); +} + +static void dma_single_map_benchmark_unprepare(void *mparam) +{ + struct dma_single_map_param *params = mparam; + + free_pages_exact(params->xbuf, params->npages * PAGE_SIZE); + kfree(params); +} + +static void dma_single_map_benchmark_initialize_data(void *mparam) +{ + struct dma_single_map_param *params = mparam; + + /* + * for a non-coherent device, if we don't stain them in the + * cache, this will give an underestimate of the real-world + * overhead of BIDIRECTIONAL or TO_DEVICE mappings; + * 66 means everything goes well! 66 is lucky. + */ + if (params->dma_dir != DMA_FROM_DEVICE) + memset(params->xbuf, 0x66, params->npages * PAGE_SIZE); +} + +static int dma_single_map_benchmark_do_map(void *mparam) +{ + struct dma_single_map_param *params = mparam; + + params->addr = dma_map_single(params->dev, params->xbuf, + params->npages * PAGE_SIZE, params->dma_dir); + if (unlikely(dma_mapping_error(params->dev, params->addr))) { + pr_err("dma_map_single failed on %s\n", dev_name(params->dev)); + return -ENOMEM; + } + + return 0; +} + +static void dma_single_map_benchmark_do_unmap(void *mparam) +{ + struct dma_single_map_param *params = mparam; + + dma_unmap_single(params->dev, params->addr, + params->npages * PAGE_SIZE, params->dma_dir); +} + +static struct map_benchmark_ops dma_single_map_benchmark_ops = { + .prepare = dma_single_map_benchmark_prepare, + .unprepare = dma_single_map_benchmark_unprepare, + .initialize_data = dma_single_map_benchmark_initialize_data, + .do_map = dma_single_map_benchmark_do_map, + .do_unmap = dma_single_map_benchmark_do_unmap, +}; + +struct dma_sg_map_param { + struct sg_table 
sgt; + struct device *dev; + void **buf; + u32 npages; + u32 dma_dir; +}; + +static void *dma_sg_map_benchmark_prepare(struct map_benchmark_data *map) +{ + struct scatterlist *sg; + int i; + + struct dma_sg_map_param *params = kzalloc(sizeof(*params), GFP_KERNEL); + + if (!params) + return NULL; + /* + * Set the number of scatterlist entries based on the granule. + * In SG mode, 'granule' represents the number of scatterlist entries. + * Each scatterlist entry corresponds to a single page. + */ + params->npages = map->bparam.granule; + params->dma_dir = map->bparam.dma_dir; + params->dev = map->dev; + params->buf = kmalloc_array(params->npages, sizeof(*params->buf), + GFP_KERNEL); + if (!params->buf) + goto out; + + if (sg_alloc_table(¶ms->sgt, params->npages, GFP_KERNEL)) + goto free_buf; + + for_each_sgtable_sg(¶ms->sgt, sg, i) { + params->buf[i] = (void *)__get_free_page(GFP_KERNEL); + if (!params->buf[i]) + goto free_page; + + sg_set_buf(sg, params->buf[i], PAGE_SIZE); + } + + return params; + +free_page: + while (i-- > 0) + free_page((unsigned long)params->buf[i]); + + sg_free_table(¶ms->sgt); +free_buf: + kfree(params->buf); +out: + kfree(params); + return NULL; +} + +static void dma_sg_map_benchmark_unprepare(void *mparam) +{ + struct dma_sg_map_param *params = mparam; + int i; + + for (i = 0; i < params->npages; i++) + free_page((unsigned long)params->buf[i]); + + sg_free_table(¶ms->sgt); + + kfree(params->buf); + kfree(params); +} + +static void dma_sg_map_benchmark_initialize_data(void *mparam) +{ + struct dma_sg_map_param *params = mparam; + struct scatterlist *sg; + int i = 0; + + if (params->dma_dir == DMA_FROM_DEVICE) + return; + + for_each_sgtable_sg(¶ms->sgt, sg, i) + memset(params->buf[i], 0x66, PAGE_SIZE); +} + +static int dma_sg_map_benchmark_do_map(void *mparam) +{ + struct dma_sg_map_param *params = mparam; + int ret = 0; + + int sg_mapped = dma_map_sg(params->dev, params->sgt.sgl, + params->npages, params->dma_dir); + if (!sg_mapped) { + 
pr_err("dma_map_sg failed on %s\n", dev_name(params->dev)); + ret = -ENOMEM; + } + + return ret; +} + +static void dma_sg_map_benchmark_do_unmap(void *mparam) +{ + struct dma_sg_map_param *params = mparam; + + dma_unmap_sg(params->dev, params->sgt.sgl, params->npages, + params->dma_dir); +} + +static struct map_benchmark_ops dma_sg_map_benchmark_ops = { + .prepare = dma_sg_map_benchmark_prepare, + .unprepare = dma_sg_map_benchmark_unprepare, + .initialize_data = dma_sg_map_benchmark_initialize_data, + .do_map = dma_sg_map_benchmark_do_map, + .do_unmap = dma_sg_map_benchmark_do_unmap, +}; + +static struct map_benchmark_ops *dma_map_benchmark_ops[DMA_MAP_BENCH_MODE_MAX] = { + [DMA_MAP_BENCH_SINGLE_MODE] = &dma_single_map_benchmark_ops, + [DMA_MAP_BENCH_SG_MODE] = &dma_sg_map_benchmark_ops, +}; + static int map_benchmark_thread(void *data) { - void *buf; - dma_addr_t dma_addr; struct map_benchmark_data *map = data; - int npages = map->bparam.granule; - u64 size = npages * PAGE_SIZE; + __u8 map_mode = map->bparam.map_mode; int ret = 0; - buf = alloc_pages_exact(size, GFP_KERNEL); - if (!buf) + struct map_benchmark_ops *mb_ops = dma_map_benchmark_ops[map_mode]; + void *mparam = mb_ops->prepare(map); + + if (!mparam) return -ENOMEM; while (!kthread_should_stop()) { @@ -49,23 +253,12 @@ static int map_benchmark_thread(void *data) ktime_t map_stime, map_etime, unmap_stime, unmap_etime; ktime_t map_delta, unmap_delta; - /* - * for a non-coherent device, if we don't stain them in the - * cache, this will give an underestimate of the real-world - * overhead of BIDIRECTIONAL or TO_DEVICE mappings; - * 66 means evertything goes well! 66 is lucky. 
- */ - if (map->dir != DMA_FROM_DEVICE) - memset(buf, 0x66, size); - + mb_ops->initialize_data(mparam); map_stime = ktime_get(); - dma_addr = dma_map_single(map->dev, buf, size, map->dir); - if (unlikely(dma_mapping_error(map->dev, dma_addr))) { - pr_err("dma_map_single failed on %s\n", - dev_name(map->dev)); - ret = -ENOMEM; + ret = mb_ops->do_map(mparam); + if (ret) goto out; - } + map_etime = ktime_get(); map_delta = ktime_sub(map_etime, map_stime); @@ -73,7 +266,8 @@ static int map_benchmark_thread(void *data) ndelay(map->bparam.dma_trans_ns); unmap_stime = ktime_get(); - dma_unmap_single(map->dev, dma_addr, size, map->dir); + mb_ops->do_unmap(mparam); + unmap_etime = ktime_get(); unmap_delta = ktime_sub(unmap_etime, unmap_stime); @@ -108,7 +302,7 @@ static int map_benchmark_thread(void *data) } out: - free_pages_exact(buf, size); + mb_ops->unprepare(mparam); return ret; } @@ -209,6 +403,12 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case DMA_MAP_BENCHMARK: + if (map->bparam.map_mode < 0 || + map->bparam.map_mode >= DMA_MAP_BENCH_MODE_MAX) { + pr_err("invalid map mode\n"); + return -EINVAL; + } + if (map->bparam.threads == 0 || map->bparam.threads > DMA_MAP_MAX_THREADS) { pr_err("invalid thread number\n"); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 3928a509c44c..23ed8eb9233e 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -157,6 +157,7 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; + bool is_cc_shared = attrs & DMA_ATTR_CC_SHARED; dma_addr_t addr = DMA_MAPPING_ERROR; BUG_ON(!valid_dma_direction(dir)); @@ -164,9 +165,15 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, if (WARN_ON_ONCE(!dev->dma_mask)) return DMA_MAPPING_ERROR; + if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT)) + return DMA_MAPPING_ERROR; + if 
(dma_map_direct(dev, ops) || - (!is_mmio && arch_dma_map_phys_direct(dev, phys + size))) - addr = dma_direct_map_phys(dev, phys, size, dir, attrs); + (!is_mmio && !is_cc_shared && + arch_dma_map_phys_direct(dev, phys + size))) + addr = dma_direct_map_phys(dev, phys, size, dir, attrs, true); + else if (is_cc_shared) + return DMA_MAPPING_ERROR; else if (use_dma_iommu(dev)) addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else if (ops->map_phys) @@ -203,11 +210,16 @@ void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); bool is_mmio = attrs & DMA_ATTR_MMIO; + bool is_cc_shared = attrs & DMA_ATTR_CC_SHARED; BUG_ON(!valid_dma_direction(dir)); + if (dma_map_direct(dev, ops) || - (!is_mmio && arch_dma_unmap_phys_direct(dev, addr + size))) - dma_direct_unmap_phys(dev, addr, size, dir, attrs); + (!is_mmio && !is_cc_shared && + arch_dma_unmap_phys_direct(dev, addr + size))) + dma_direct_unmap_phys(dev, addr, size, dir, attrs, true); + else if (is_cc_shared) + return; else if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else if (ops->unmap_phys) @@ -235,6 +247,9 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); + if (!dev_is_dma_coherent(dev) && (attrs & DMA_ATTR_REQUIRE_COHERENT)) + return -EOPNOTSUPP; + if (WARN_ON_ONCE(!dev->dma_mask)) return 0; @@ -373,7 +388,7 @@ void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops)) - dma_direct_sync_single_for_cpu(dev, addr, size, dir); + dma_direct_sync_single_for_cpu(dev, addr, size, dir, true); else if (use_dma_iommu(dev)) iommu_dma_sync_single_for_cpu(dev, addr, size, dir); else if (ops->sync_single_for_cpu) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d8e6f1d889d5..9a15e7231e39 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -30,6 +30,7 @@ 
#include <linux/gfp.h> #include <linux/highmem.h> #include <linux/io.h> +#include <linux/kmsan-checks.h> #include <linux/iommu-helper.h> #include <linux/init.h> #include <linux/memblock.h> @@ -867,6 +868,9 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size if (orig_addr == INVALID_PHYS_ADDR) return; + if (dir == DMA_FROM_DEVICE && !dev_is_dma_coherent(dev)) + arch_sync_dma_flush(); + /* * It's valid for tlb_offset to be negative. This can happen when the * "offset" returned by swiotlb_align_offset() is non-zero, and the @@ -901,10 +905,19 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size local_irq_save(flags); page = pfn_to_page(pfn); - if (dir == DMA_TO_DEVICE) + if (dir == DMA_TO_DEVICE) { + /* + * Ideally, kmsan_check_highmem_page() + * could be used here to detect infoleaks, + * but callers may map uninitialized buffers + * that will be written by the device, + * causing false positives. + */ memcpy_from_page(vaddr, page, offset, sz); - else + } else { + kmsan_unpoison_memory(vaddr, sz); memcpy_to_page(page, offset, vaddr, sz); + } local_irq_restore(flags); size -= sz; @@ -913,8 +926,15 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size offset = 0; } } else if (dir == DMA_TO_DEVICE) { + /* + * Ideally, kmsan_check_memory() could be used here to detect + * infoleaks (uninitialized data being sent to device), but + * callers may map uninitialized buffers that will be written + * by the device, causing false positives. 
+ */ memcpy(vaddr, phys_to_virt(orig_addr), size); } else { + kmsan_unpoison_memory(vaddr, size); memcpy(phys_to_virt(orig_addr), vaddr, size); } } @@ -1595,8 +1615,10 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, return DMA_MAPPING_ERROR; } - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { arch_sync_dma_for_device(swiotlb_addr, size, dir); + arch_sync_dma_flush(); + } return dma_addr; } @@ -1855,26 +1877,25 @@ static void rmem_swiotlb_device_release(struct reserved_mem *rmem, dev->dma_io_tlb_mem = &io_tlb_default_mem; } -static const struct reserved_mem_ops rmem_swiotlb_ops = { - .device_init = rmem_swiotlb_device_init, - .device_release = rmem_swiotlb_device_release, -}; - -static int __init rmem_swiotlb_setup(struct reserved_mem *rmem) +static int __init rmem_swiotlb_setup(unsigned long node, + struct reserved_mem *rmem) { - unsigned long node = rmem->fdt_node; - if (of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "linux,cma-default", NULL) || of_get_flat_dt_prop(node, "linux,dma-default", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; - rmem->ops = &rmem_swiotlb_ops; pr_info("Reserved memory: created restricted DMA pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); return 0; } -RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup); +static const struct reserved_mem_ops rmem_swiotlb_ops = { + .node_init = rmem_swiotlb_setup, + .device_init = rmem_swiotlb_device_init, + .device_release = rmem_swiotlb_device_release, +}; + +RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", &rmem_swiotlb_ops); #endif /* CONFIG_DMA_RESTRICTED_POOL */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9ef63e414791..19d2244a9fef 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,10 +47,10 @@ static __always_inline unsigned long 
__exit_to_user_mode_loop(struct pt_regs *re */ while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { - local_irq_enable_exit_to_user(ti_work); + local_irq_enable(); if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { - if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY)) + if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY)) schedule(); } @@ -74,7 +74,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re * might have changed while interrupts and preemption was * enabled above. */ - local_irq_disable_exit_to_user(); + local_irq_disable(); /* Check if any of the above work has queued a deferred wakeup */ tick_nohz_user_enter_prepare(); @@ -105,70 +105,16 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) { - irqentry_state_t ret = { - .exit_rcu = false, - }; - if (user_mode(regs)) { - irqentry_enter_from_user_mode(regs); - return ret; - } + irqentry_state_t ret = { + .exit_rcu = false, + }; - /* - * If this entry hit the idle task invoke ct_irq_enter() whether - * RCU is watching or not. - * - * Interrupts can nest when the first interrupt invokes softirq - * processing on return which enables interrupts. - * - * Scheduler ticks in the idle task can mark quiescent state and - * terminate a grace period, if and only if the timer interrupt is - * not nested into another interrupt. - * - * Checking for rcu_is_watching() here would prevent the nesting - * interrupt to invoke ct_irq_enter(). If that nested interrupt is - * the tick then rcu_flavor_sched_clock_irq() would wrongfully - * assume that it is the first interrupt and eventually claim - * quiescent state and end grace periods prematurely. - * - * Unconditionally invoke ct_irq_enter() so RCU state stays - * consistent. - * - * TINY_RCU does not support EQS, so let the compiler eliminate - * this part when enabled. 
- */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && - (is_idle_task(current) || arch_in_rcu_eqs())) { - /* - * If RCU is not watching then the same careful - * sequence vs. lockdep and tracing is required - * as in irqentry_enter_from_user_mode(). - */ - lockdep_hardirqs_off(CALLER_ADDR0); - ct_irq_enter(); - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - trace_hardirqs_off_finish(); - instrumentation_end(); - - ret.exit_rcu = true; + irqentry_enter_from_user_mode(regs); return ret; } - /* - * If RCU is watching then RCU only wants to check whether it needs - * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() - * already contains a warning when RCU is not watching, so no point - * in having another one here. - */ - lockdep_hardirqs_off(CALLER_ADDR0); - instrumentation_begin(); - kmsan_unpoison_entry_regs(regs); - rcu_irq_enter_check_tick(); - trace_hardirqs_off_finish(); - instrumentation_end(); - - return ret; + return irqentry_enter_from_kernel_mode(regs); } /** @@ -212,43 +158,10 @@ void dynamic_irqentry_exit_cond_resched(void) noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) { - lockdep_assert_irqs_disabled(); - - /* Check whether this returns to user mode */ - if (user_mode(regs)) { + if (user_mode(regs)) irqentry_exit_to_user_mode(regs); - } else if (!regs_irqs_disabled(regs)) { - /* - * If RCU was not watching on entry this needs to be done - * carefully and needs the same ordering of lockdep/tracing - * and RCU as the return to user mode path. 
- */ - if (state.exit_rcu) { - instrumentation_begin(); - /* Tell the tracer that IRET will enable interrupts */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - instrumentation_end(); - ct_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) - irqentry_exit_cond_resched(); - - /* Covers both tracing and lockdep */ - trace_hardirqs_on(); - instrumentation_end(); - } else { - /* - * IRQ flags state is correct already. Just tell RCU if it - * was not watching on entry. - */ - if (state.exit_rcu) - ct_irq_exit(); - } + else + irqentry_exit_to_kernel_mode(regs, state); } irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) diff --git a/kernel/events/core.c b/kernel/events/core.c index ac70d68217b6..6d1f8bad7e1c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4138,7 +4138,8 @@ static int merge_sched_in(struct perf_event *event, void *data) if (*perf_event_fasync(event)) event->pending_kill = POLL_ERR; - perf_event_wakeup(event); + event->pending_wakeup = 1; + irq_work_queue(&event->pending_irq); } else { struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); @@ -4812,7 +4813,7 @@ static void __perf_event_read(void *info) struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); - struct pmu *pmu = event->pmu; + struct pmu *pmu; /* * If this is a task context, we need to check whether it is @@ -4824,7 +4825,7 @@ static void __perf_event_read(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - raw_spin_lock(&ctx->lock); + guard(raw_spinlock)(&ctx->lock); ctx_time_update_event(ctx, event); perf_event_update_time(event); @@ -4832,25 +4833,22 @@ static void __perf_event_read(void *info) perf_event_update_sibling_time(event); if (event->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; + return; if (!data->group) { - pmu->read(event); 
+ perf_pmu_read(event); data->ret = 0; - goto unlock; + return; } + pmu = event->pmu_ctx->pmu; pmu->start_txn(pmu, PERF_PMU_TXN_READ); - pmu->read(event); - + perf_pmu_read(event); for_each_sibling_event(sub, event) perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); - -unlock: - raw_spin_unlock(&ctx->lock); } static inline u64 perf_event_count(struct perf_event *event, bool self) @@ -5370,15 +5368,15 @@ static void unaccount_freq_event(void) static struct perf_ctx_data * -alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global) +alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global, gfp_t gfp_flags) { struct perf_ctx_data *cd; - cd = kzalloc_obj(*cd); + cd = kzalloc_obj(*cd, gfp_flags); if (!cd) return NULL; - cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL); + cd->data = kmem_cache_zalloc(ctx_cache, gfp_flags); if (!cd->data) { kfree(cd); return NULL; @@ -5412,11 +5410,11 @@ static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd) static int attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, - bool global) + bool global, gfp_t gfp_flags) { struct perf_ctx_data *cd, *old = NULL; - cd = alloc_perf_ctx_data(ctx_cache, global); + cd = alloc_perf_ctx_data(ctx_cache, global, gfp_flags); if (!cd) return -ENOMEM; @@ -5489,6 +5487,12 @@ again: cd = NULL; } if (!cd) { + /* + * Try to allocate context quickly before + * traversing the whole thread list again. 
+ */ + if (!attach_task_ctx_data(p, ctx_cache, true, GFP_NOWAIT)) + continue; get_task_struct(p); goto alloc; } @@ -5499,7 +5503,7 @@ again: return 0; alloc: - ret = attach_task_ctx_data(p, ctx_cache, true); + ret = attach_task_ctx_data(p, ctx_cache, true, GFP_KERNEL); put_task_struct(p); if (ret) { __detach_global_ctx_data(); @@ -5519,7 +5523,7 @@ attach_perf_ctx_data(struct perf_event *event) return -ENOMEM; if (task) - return attach_task_ctx_data(task, ctx_cache, false); + return attach_task_ctx_data(task, ctx_cache, false, GFP_KERNEL); ret = attach_global_ctx_data(ctx_cache); if (ret) @@ -5554,22 +5558,15 @@ static void __detach_global_ctx_data(void) struct task_struct *g, *p; struct perf_ctx_data *cd; -again: scoped_guard (rcu) { for_each_process_thread(g, p) { cd = rcu_dereference(p->perf_ctx_data); - if (!cd || !cd->global) - continue; - cd->global = 0; - get_task_struct(p); - goto detach; + if (cd && cd->global) { + cd->global = 0; + detach_task_ctx_data(p); + } } } - return; -detach: - detach_task_ctx_data(p); - put_task_struct(p); - goto again; } static void detach_global_ctx_data(void) @@ -7215,7 +7212,7 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) #ifdef CONFIG_MMU /* Clear any partial mappings on error. */ if (err) - zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); + zap_vma_range(vma, vma->vm_start, nr_pages * PAGE_SIZE); #endif return err; @@ -7464,28 +7461,28 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = perf_mmap_aux(vma, event, nr_pages); if (ret) return ret; - } - /* - * Since pinned accounting is per vm we cannot allow fork() to copy our - * vma. - */ - vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); - vma->vm_ops = &perf_mmap_vmops; + /* + * Since pinned accounting is per vm we cannot allow fork() to copy our + * vma. 
+ */ + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); + vma->vm_ops = &perf_mmap_vmops; - mapped = get_mapped(event, event_mapped); - if (mapped) - mapped(event, vma->vm_mm); + mapped = get_mapped(event, event_mapped); + if (mapped) + mapped(event, vma->vm_mm); - /* - * Try to map it into the page table. On fail, invoke - * perf_mmap_close() to undo the above, as the callsite expects - * full cleanup in this case and therefore does not invoke - * vmops::close(). - */ - ret = map_range(event->rb, vma); - if (ret) - perf_mmap_close(vma); + /* + * Try to map it into the page table. On fail, invoke + * perf_mmap_close() to undo the above, as the callsite expects + * full cleanup in this case and therefore does not invoke + * vmops::close(). + */ + ret = map_range(event->rb, vma); + if (ret) + perf_mmap_close(vma); + } return ret; } @@ -8422,7 +8419,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) pte_t *ptep, pte; pgdp = pgd_offset(mm, addr); - pgd = READ_ONCE(*pgdp); + pgd = pgdp_get(pgdp); if (pgd_none(pgd)) return 0; @@ -8430,7 +8427,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return pgd_leaf_size(pgd); p4dp = p4d_offset_lockless(pgdp, pgd, addr); - p4d = READ_ONCE(*p4dp); + p4d = p4dp_get(p4dp); if (!p4d_present(p4d)) return 0; @@ -8438,7 +8435,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return p4d_leaf_size(p4d); pudp = pud_offset_lockless(p4dp, p4d, addr); - pud = READ_ONCE(*pudp); + pud = pudp_get(pudp); if (!pud_present(pud)) return 0; @@ -9240,7 +9237,7 @@ perf_event_alloc_task_data(struct task_struct *child, return; attach: - attach_task_ctx_data(child, ctx_cache, true); + attach_task_ctx_data(child, ctx_cache, true, GFP_KERNEL); } void perf_event_fork(struct task_struct *task) @@ -10776,6 +10773,13 @@ int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + /* + * Entry point from hardware 
PMI, interrupts should be disabled here. + * This serializes us against perf_event_remove_from_context() in + * things like perf_event_release_kernel(). + */ + lockdep_assert_irqs_disabled(); + return __perf_event_overflow(event, 1, data, regs); } @@ -10852,6 +10856,19 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, { struct hw_perf_event *hwc = &event->hw; + /* + * This is: + * - software preempt + * - tracepoint preempt + * - tp_target_task irq (ctx->lock) + * - uprobes preempt/irq + * - kprobes preempt/irq + * - hw_breakpoint irq + * + * Any of these are sufficient to hold off RCU and thus ensure @event + * exists. + */ + lockdep_assert_preemption_disabled(); local64_add(nr, &event->count); if (!regs) @@ -10860,6 +10877,16 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!is_sampling_event(event)) return; + /* + * Serialize against event_function_call() IPIs like normal overflow + * event handling. Specifically, must not allow + * perf_event_release_kernel() -> perf_remove_from_context() to make + * progress and 'release' the event from under us. + */ + guard(irqsave)(); + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { data->period = nr; return perf_swevent_overflow(event, 1, data, regs); @@ -11358,6 +11385,11 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct perf_sample_data data; struct perf_event *event; + /* + * Per being a tracepoint, this runs with preemption disabled. + */ + lockdep_assert_preemption_disabled(); + struct perf_raw_record raw = { .frag = { .size = entry_size, @@ -11690,6 +11722,11 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; + /* + * Exception context, will have interrupts disabled. 
+ */ + lockdep_assert_irqs_disabled(); + perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) @@ -12154,7 +12191,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) - if (__perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } @@ -14703,7 +14740,7 @@ inherit_event(struct perf_event *parent_event, get_ctx(child_ctx); child_event->ctx = child_ctx; - pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); + pmu_ctx = find_get_pmu_context(parent_event->pmu_ctx->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); return ERR_CAST(pmu_ctx); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 923b24b321cc..4084e926e284 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -344,7 +344,7 @@ out: static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { - pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " + pr_warn("ref_ctr %s failed for inode: 0x%llx offset: " "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, @@ -982,7 +982,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) static void ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { - pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " + pr_warn("ref_ctr_offset mismatch. 
inode: 0x%llx offset: 0x%llx " "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) cur_uprobe->ref_ctr_offset, diff --git a/kernel/exit.c b/kernel/exit.c index 8a87021211ae..25e9cb6de7e7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -608,7 +608,8 @@ static struct task_struct *find_child_reaper(struct task_struct *father, reaper = find_alive_thread(father); if (reaper) { - pid_ns->child_reaper = reaper; + ASSERT_EXCLUSIVE_WRITER(pid_ns->child_reaper); + WRITE_ONCE(pid_ns->child_reaper, reaper); return reaper; } @@ -748,14 +749,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { - int sig = thread_group_leader(tsk) && - thread_group_empty(tsk) && - !ptrace_reparented(tsk) ? - tsk->exit_signal : SIGCHLD; + int sig = thread_group_empty(tsk) && !ptrace_reparented(tsk) + ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && - do_notify_parent(tsk, tsk->exit_signal); + do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; /* untraced sub-thread */ @@ -896,11 +895,16 @@ static void synchronize_group_exit(struct task_struct *tsk, long code) void __noreturn do_exit(long code) { struct task_struct *tsk = current; + struct kthread *kthread; int group_dead; WARN_ON(irqs_disabled()); WARN_ON(tsk->plug); + kthread = tsk_is_kthread(tsk); + if (unlikely(kthread)) + kthread_do_exit(kthread, code); + kcov_task_exit(tsk); kmsan_task_exit(tsk); @@ -1013,6 +1017,7 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } +EXPORT_SYMBOL(do_exit); void __noreturn make_task_dead(int signr) { diff --git a/kernel/fork.c b/kernel/fork.c index e832da9d15a4..f1ad69c6dc2d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -46,6 +46,7 @@ #include <linux/mm_inline.h> #include <linux/memblock.h> #include 
<linux/nsproxy.h> +#include <linux/ns/ns_common_types.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cgroup.h> @@ -95,6 +96,7 @@ #include <linux/thread_info.h> #include <linux/kstack_erase.h> #include <linux/kasan.h> +#include <linux/randomize_kstack.h> #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/io_uring_types.h> @@ -345,7 +347,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. */ - memset(stack, 0, THREAD_SIZE); + clear_pages(vm_area->addr, vm_area->nr_pages); tsk->stack_vm_area = vm_area; tsk->stack = stack; @@ -1000,6 +1002,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid.cid = MM_CID_UNSET; tsk->mm_cid.active = 0; + INIT_HLIST_NODE(&tsk->mm_cid.node); #endif return tsk; @@ -1013,13 +1016,14 @@ free_tsk: __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; +static unsigned long coredump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) { - default_dump_filter = - (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & - MMF_DUMP_FILTER_MASK; + if (kstrtoul(s, 0, &coredump_filter)) + return 0; + coredump_filter <<= MMF_DUMP_FILTER_SHIFT; + coredump_filter &= MMF_DUMP_FILTER_MASK; return 1; } @@ -1105,7 +1109,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - __mm_flags_overwrite_word(mm, default_dump_filter); + __mm_flags_overwrite_word(mm, coredump_filter); mm->def_flags = 0; } @@ -1586,7 +1590,6 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk) tsk->mm = mm; tsk->active_mm = mm; - sched_mm_cid_fork(tsk); return 0; } @@ -2028,6 +2031,41 @@ __latent_entropy 
struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_AUTOREAP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_PARENT) + return ERR_PTR(-EINVAL); + if (args->exit_signal) + return ERR_PTR(-EINVAL); + } + + if ((clone_flags & CLONE_PARENT) && current->signal->autoreap) + return ERR_PTR(-EINVAL); + + if (clone_flags & CLONE_NNP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + } + + if (clone_flags & CLONE_PIDFD_AUTOKILL) { + if (!(clone_flags & CLONE_PIDFD)) + return ERR_PTR(-EINVAL); + if (!(clone_flags & CLONE_AUTOREAP)) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + /* + * Without CLONE_NNP the child could escalate privileges + * after being spawned, so require CAP_SYS_ADMIN. + * With CLONE_NNP the child can't gain new privileges, + * so allow unprivileged usage. + */ + if (!(clone_flags & CLONE_NNP) && + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -2076,6 +2114,7 @@ __latent_entropy struct task_struct *copy_process( ftrace_graph_init_task(p); rt_mutex_init_task(p); + raw_spin_lock_init(&p->blocked_lock); lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING @@ -2250,13 +2289,18 @@ __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + unsigned flags = PIDFD_STALE; + + if (clone_flags & CLONE_THREAD) + flags |= PIDFD_THREAD; + if (clone_flags & CLONE_PIDFD_AUTOKILL) + flags |= PIDFD_AUTOKILL; /* * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. 
*/ - retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); + retval = pidfd_prepare(pid, flags, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2392,7 +2436,11 @@ __latent_entropy struct task_struct *copy_process( rseq_fork(p, clone_flags); - /* Don't start children in a dying pid namespace */ + /* + * If zap_pid_ns_processes() was called after alloc_pid(), the new + * child missed SIGKILL. If current is not in the same namespace, + * we can't rely on fatal_signal_pending() below. + */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_core_free; @@ -2412,6 +2460,9 @@ __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + if (clone_flags & CLONE_NNP) + task_set_no_new_privs(p); + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2423,7 +2474,10 @@ __latent_entropy struct task_struct *copy_process( init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { - ns_of_pid(pid)->child_reaper = p; + struct pid_namespace *ns = ns_of_pid(pid); + + ASSERT_EXCLUSIVE_WRITER(ns->child_reaper); + WRITE_ONCE(ns->child_reaper, p); p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->shared_pending.signal = delayed.signal; @@ -2435,6 +2489,8 @@ __latent_entropy struct task_struct *copy_process( */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; + if (clone_flags & CLONE_AUTOREAP) + p->signal->autoreap = 1; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); @@ -2463,8 +2519,12 @@ __latent_entropy struct task_struct *copy_process( fd_install(pidfd, pidfile); proc_fork_connector(p); - sched_post_fork(p); + /* + * sched_ext needs @p to be associated with its cgroup in its post_fork + * hook. cgroup_post_fork() should come before sched_post_fork(). 
+ */ cgroup_post_fork(p, args); + sched_post_fork(p); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -2498,7 +2558,6 @@ bad_fork_cleanup_namespaces: exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { - sched_mm_cid_exit(p); mm_clear_owner(p->mm, p); mmput(p->mm); } @@ -2620,6 +2679,16 @@ pid_t kernel_clone(struct kernel_clone_args *args) pid_t nr; /* + * Creating an empty mount namespace implies creating a new mount + * namespace. Set this before copy_process() so that the + * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly. + */ + if (clone_flags & CLONE_EMPTY_MNTNS) { + clone_flags |= CLONE_NEWNS; + args->flags = clone_flags; + } + + /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate @@ -2897,7 +2966,9 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. 
*/ if (kargs->flags & - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | + CLONE_INTO_CGROUP | CLONE_AUTOREAP | CLONE_NNP | + CLONE_PIDFD_AUTOKILL | CLONE_EMPTY_MNTNS)) return false; /* @@ -3046,11 +3117,9 @@ void __init proc_caches_init(void) */ static int check_unshare_flags(unsigned long unshare_flags) { - if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| - CLONE_NEWTIME)) + CLONE_NS_ALL | UNSHARE_EMPTY_MNTNS)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing @@ -3085,7 +3154,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) return 0; /* don't need lock here; in the worst case we'll do useless copy */ - if (fs->users == 1) + if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1) return 0; *new_fsp = copy_fs_struct(fs); @@ -3149,6 +3218,8 @@ int ksys_unshare(unsigned long unshare_flags) /* * If unsharing namespace, must also unshare filesystem information. 
*/ + if (unshare_flags & UNSHARE_EMPTY_MNTNS) + unshare_flags |= CLONE_NEWNS; if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -3175,11 +3246,10 @@ int ksys_unshare(unsigned long unshare_flags) new_cred, new_fs); if (err) goto bad_unshare_cleanup_cred; - if (new_cred) { err = set_cred_ucounts(new_cred); if (err) - goto bad_unshare_cleanup_cred; + goto bad_unshare_cleanup_nsproxy; } if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { @@ -3195,8 +3265,10 @@ int ksys_unshare(unsigned long unshare_flags) shm_init_task(current); } - if (new_nsproxy) + if (new_nsproxy) { switch_task_namespaces(current, new_nsproxy); + new_nsproxy = NULL; + } task_lock(current); @@ -3225,13 +3297,15 @@ int ksys_unshare(unsigned long unshare_flags) perf_event_namespaces(current); +bad_unshare_cleanup_nsproxy: + if (new_nsproxy) + put_nsproxy(new_nsproxy); bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile index b77188d1fa07..dce70f8a322b 100644 --- a/kernel/futex/Makefile +++ b/kernel/futex/Makefile @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 +CONTEXT_ANALYSIS := y + obj-y += core.o syscalls.o pi.o requeue.o waitwake.o diff --git a/kernel/futex/core.c b/kernel/futex/core.c index cf7e610eac42..ff2a4fb2993f 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -342,7 +342,7 @@ static int __futex_key_to_node(struct mm_struct *mm, unsigned long addr) if (!vma) return FUTEX_NO_NODE; - mpol = vma_policy(vma); + mpol = READ_ONCE(vma->vm_policy); if (!mpol) return FUTEX_NO_NODE; @@ -864,7 +864,6 @@ void __futex_unqueue(struct futex_q *q) /* The key must be already stored in q->key. 
*/ void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) - __acquires(&hb->lock) { /* * Increment the counter before taking the lock so that @@ -879,10 +878,10 @@ void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) q->lock_ptr = &hb->lock; spin_lock(&hb->lock); + __acquire(q->lock_ptr); } void futex_q_unlock(struct futex_hash_bucket *hb) - __releases(&hb->lock) { futex_hb_waiters_dec(hb); spin_unlock(&hb->lock); @@ -1443,12 +1442,15 @@ static void futex_cleanup(struct task_struct *tsk) void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ - if (tsk->futex_state == FUTEX_STATE_EXITING) + if (tsk->futex_state == FUTEX_STATE_EXITING) { + __assume_ctx_lock(&tsk->futex_exit_mutex); mutex_unlock(&tsk->futex_exit_mutex); + } tsk->futex_state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) + __acquires(&tsk->futex_exit_mutex) { /* * Prevent various race issues against a concurrent incoming waiter @@ -1475,6 +1477,7 @@ static void futex_cleanup_begin(struct task_struct *tsk) } static void futex_cleanup_end(struct task_struct *tsk, int state) + __releases(&tsk->futex_exit_mutex) { /* * Lockless store. 
The only side effect is that an observer might diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 30c2afa03889..9f6bf6f585fc 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -217,7 +217,7 @@ enum futex_access { extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); -extern void futex_q_lockptr_lock(struct futex_q *q); +extern void futex_q_lockptr_lock(struct futex_q *q) __acquires(q->lock_ptr); extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); @@ -311,9 +311,11 @@ extern int futex_unqueue(struct futex_q *q); static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, struct task_struct *task) __releases(&hb->lock) + __releases(q->lock_ptr) { __futex_queue(q, hb, task); spin_unlock(&hb->lock); + __release(q->lock_ptr); } extern void futex_unqueue_pi(struct futex_q *q); @@ -358,9 +360,12 @@ static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) #endif } -extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb); -extern void futex_q_unlock(struct futex_hash_bucket *hb); +extern void futex_q_lock(struct futex_q *q, struct futex_hash_bucket *hb) + __acquires(&hb->lock) + __acquires(q->lock_ptr); +extern void futex_q_unlock(struct futex_hash_bucket *hb) + __releases(&hb->lock); extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, @@ -379,6 +384,9 @@ extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); */ static inline void double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) + __acquires(&hb1->lock) + __acquires(&hb2->lock) + __no_context_analysis { if (hb1 > hb2) swap(hb1, hb2); @@ -390,6 +398,9 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket 
*hb2) + __releases(&hb1->lock) + __releases(&hb2->lock) + __no_context_analysis { spin_unlock(&hb1->lock); if (hb1 != hb2) diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index bc1f7e83a37e..643199fdbe62 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -389,6 +389,7 @@ static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, * Initialize the pi_mutex in locked state and make @p * the owner of it: */ + __assume_ctx_lock(&pi_state->pi_mutex.wait_lock); rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); /* Store the key for possible exit cleanups: */ @@ -614,6 +615,8 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state, struct rt_mutex_waiter *top_waiter) + __must_hold(&pi_state->pi_mutex.wait_lock) + __releases(&pi_state->pi_mutex.wait_lock) { struct task_struct *new_owner; bool postunlock = false; @@ -670,6 +673,8 @@ out_unlock: static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, struct task_struct *argowner) + __must_hold(&q->pi_state->pi_mutex.wait_lock) + __must_hold(q->lock_ptr) { struct futex_pi_state *pi_state = q->pi_state; struct task_struct *oldowner, *newowner; @@ -918,7 +923,7 @@ int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked) int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to; - struct task_struct *exiting = NULL; + struct task_struct *exiting; struct rt_mutex_waiter rt_waiter; struct futex_q q = futex_q_init; DEFINE_WAKE_Q(wake_q); @@ -933,6 +938,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl to = futex_setup_timer(time, &timeout, flags, 0); retry: + exiting = NULL; ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; @@ -966,6 +972,7 @@ retry_private: * - EAGAIN: The user space value changed. 
*/ futex_q_unlock(hb); + __release(q.lock_ptr); /* * Handle the case where the owner is in the middle of * exiting. Wait for the exit to complete otherwise @@ -1090,6 +1097,7 @@ no_block: if (res) ret = (res < 0) ? res : 0; + __release(&hb->lock); futex_unqueue_pi(&q); spin_unlock(q.lock_ptr); if (q.drop_hb_ref) { @@ -1101,10 +1109,12 @@ no_block: out_unlock_put_key: futex_q_unlock(hb); + __release(q.lock_ptr); goto out; uaddr_faulted: futex_q_unlock(hb); + __release(q.lock_ptr); ret = fault_in_user_writeable(uaddr); if (ret) diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 743c7a728237..77ad9691f6a6 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -459,6 +459,14 @@ SYSCALL_DEFINE4(futex_requeue, if (ret) return ret; + /* + * For now mandate both flags are identical, like the sys_futex() + * interface has. If/when we merge the variable sized futex support, + * that patch can modify this test to allow a difference in size. + */ + if (futexes[0].w.flags != futexes[1].w.flags) + return -EINVAL; + cmpval = futexes[0].w.val; return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 1c2dd03f11ec..ceed9d879059 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -462,6 +462,7 @@ retry: } futex_q_unlock(hb); + __release(q->lock_ptr); } __set_current_state(TASK_RUNNING); @@ -628,6 +629,7 @@ retry_private: if (ret) { futex_q_unlock(hb); + __release(q->lock_ptr); ret = get_user(uval, uaddr); if (ret) @@ -641,11 +643,13 @@ retry_private: if (uval != val) { futex_q_unlock(hb); + __release(q->lock_ptr); return -EWOULDBLOCK; } if (key2 && futex_match(&q->key, key2)) { futex_q_unlock(hb); + __release(q->lock_ptr); return -EINVAL; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d2254c91450b..6fcc94ce4ca9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -36,7 +36,7 @@ static int __read_mostly 
sysctl_hung_task_check_count = PID_MAX_LIMIT; /* * Total number of tasks detected as hung since boot: */ -static unsigned long __read_mostly sysctl_hung_task_detect_count; +static atomic_long_t sysctl_hung_task_detect_count = ATOMIC_LONG_INIT(0); /* * Limit number of tasks checked in a batch. @@ -223,37 +223,36 @@ static inline void debug_show_blocker(struct task_struct *task, unsigned long ti } #endif -static void check_hung_task(struct task_struct *t, unsigned long timeout, - unsigned long prev_detect_count) +/** + * hung_task_info - Print diagnostic details for a hung task + * @t: Pointer to the detected hung task. + * @timeout: Timeout threshold for detecting hung tasks + * @this_round_count: Count of hung tasks detected in the current iteration + * + * Print structured information about the specified hung task, if warnings + * are enabled or if the panic batch threshold is exceeded. + */ +static void hung_task_info(struct task_struct *t, unsigned long timeout, + unsigned long this_round_count) { - unsigned long total_hung_task; - - if (!task_is_hung(t, timeout)) - return; - - /* - * This counter tracks the total number of tasks detected as hung - * since boot. - */ - sysctl_hung_task_detect_count++; - - total_hung_task = sysctl_hung_task_detect_count - prev_detect_count; trace_sched_process_hang(t); - if (sysctl_hung_task_panic && total_hung_task >= sysctl_hung_task_panic) { + if (sysctl_hung_task_panic && this_round_count >= sysctl_hung_task_panic) { console_verbose(); hung_task_call_panic = true; } /* - * Ok, the task did not get scheduled for more than 2 minutes, - * complain: + * The given task did not get scheduled for more than + * CONFIG_DEFAULT_HUNG_TASK_TIMEOUT. 
Therefore, complain + * accordingly */ if (sysctl_hung_task_warnings || hung_task_call_panic) { if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; - pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", - t->comm, t->pid, (jiffies - t->last_switch_time) / HZ); + pr_err("INFO: task %s:%d blocked%s for more than %ld seconds.\n", + t->comm, t->pid, t->in_iowait ? " in I/O wait" : "", + (jiffies - t->last_switch_time) / HZ); pr_err(" %s %s %.*s\n", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), @@ -297,15 +296,14 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) /* * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for - * a really long time (120 seconds). If that happens, print out - * a warning. + * a really long time. If that happens, print out a warning. */ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; unsigned long last_break = jiffies; struct task_struct *g, *t; - unsigned long prev_detect_count = sysctl_hung_task_detect_count; + unsigned long this_round_count; int need_warning = sysctl_hung_task_warnings; unsigned long si_mask = hung_task_si_mask; @@ -316,10 +314,9 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; - + this_round_count = 0; rcu_read_lock(); for_each_process_thread(g, t) { - if (!max_count--) goto unlock; if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { @@ -328,12 +325,22 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) last_break = jiffies; } - check_hung_task(t, timeout, prev_detect_count); + if (task_is_hung(t, timeout)) { + /* + * Increment the global counter so that userspace could + * start migrating tasks ASAP. But count the current + * round separately because userspace could reset + * the global counter at any time. 
+ */ + atomic_long_inc(&sysctl_hung_task_detect_count); + this_round_count++; + hung_task_info(t, timeout, this_round_count); + } } unlock: rcu_read_unlock(); - if (!(sysctl_hung_task_detect_count - prev_detect_count)) + if (!this_round_count) return; if (need_warning || hung_task_call_panic) { @@ -358,6 +365,46 @@ static long hung_timeout_jiffies(unsigned long last_checked, } #ifdef CONFIG_SYSCTL + +/** + * proc_dohung_task_detect_count - proc handler for hung_task_detect_count + * @table: Pointer to the struct ctl_table definition for this proc entry + * @dir: Flag indicating the operation + * @buffer: User space buffer for data transfer + * @lenp: Pointer to the length of the data being transferred + * @ppos: Pointer to the current file offset + * + * This handler is used for reading the current hung task detection count + * and for resetting it to zero when a write operation is performed using a + * zero value only. + * Return: 0 on success, or a negative error code on failure. + */ +static int proc_dohung_task_detect_count(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned long detect_count; + struct ctl_table proxy_table; + int err; + + proxy_table = *table; + proxy_table.data = &detect_count; + + if (SYSCTL_KERN_TO_USER(dir)) + detect_count = atomic_long_read(&sysctl_hung_task_detect_count); + + err = proc_doulongvec_minmax(&proxy_table, dir, buffer, lenp, ppos); + if (err < 0) + return err; + + if (SYSCTL_USER_TO_KERN(dir)) { + if (detect_count) + return -EINVAL; + atomic_long_set(&sysctl_hung_task_detect_count, 0); + } + + return 0; +} + /* * Process updating of timeout sysctl */ @@ -438,10 +485,9 @@ static const struct ctl_table hung_task_sysctls[] = { }, { .procname = "hung_task_detect_count", - .data = &sysctl_hung_task_detect_count, .maxlen = sizeof(unsigned long), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, + .mode = 0644, + .proc_handler = proc_dohung_task_detect_count, }, { .procname = 
"hung_task_sys_info", diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 85c45cfe7223..78f2418a8925 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -115,13 +115,10 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, if (resv > minvec) return 0; - if (affd->calc_sets) { + if (affd->calc_sets) set_vecs = maxvec - resv; - } else { - cpus_read_lock(); + else set_vecs = cpumask_weight(cpu_possible_mask); - cpus_read_unlock(); - } return resv + min(set_vecs, maxvec - resv); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6147a07d0127..6c9b1dc4e7d4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/irqdomain.h> +#include <linux/random.h> #include <trace/events/irq.h> @@ -929,6 +930,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc) enabled ? " and unmasked" : "", irq, cpu); } + add_interrupt_randomness(irq); + if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); } diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 0f79a4abea05..faafb43a4e61 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -39,7 +39,7 @@ struct irq_matrix { /** * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it - * @matrix_bits: Number of matrix bits must be <= IRQ_MATRIX_BITS + * @matrix_bits: Number of matrix bits * @alloc_start: From which bit the allocation search starts * @alloc_end: At which bit the allocation search ends, i.e first * invalid bit diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 73f7e1fd4ab4..120fd7365fbe 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -79,7 +79,7 @@ void __weak arch_irq_work_raise(void) static __always_inline void irq_work_raise(struct irq_work *work) { if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt()) - trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func); + trace_call__ipi_send_cpu(smp_processor_id(), 
_RET_IP_, work->func); arch_irq_work_raise(); } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 7cb19e601426..e851e4b37d0e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -529,15 +529,6 @@ void __init jump_label_init(void) struct static_key *key = NULL; struct jump_entry *iter; - /* - * Since we are initializing the static_key.enabled field with - * with the 'raw' int values (to avoid pulling in atomic.h) in - * jump_label.h, let's make sure that is safe. There are only two - * cases to check since we initialize to 0 or 1. - */ - BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); - BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); - if (static_key_initialized) return; diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 79e655ea4ca1..ae758150ccb9 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -168,7 +168,7 @@ static bool __report_matches(const struct expect_report *r) if (!report_available()) return false; - expect = kmalloc_obj(observed.lines); + expect = (typeof(expect))kmalloc_obj(observed.lines); if (WARN_ON(!expect)) return false; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 2fea396d29b9..a43d2da0fe3e 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -47,7 +47,6 @@ #include <asm/page.h> #include <asm/sections.h> -#include <crypto/hash.h> #include "kexec_internal.h" atomic_t __kexec_lock = ATOMIC_INIT(0); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ab25b4aa9095..bfc89083daa9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1144,12 +1144,12 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, lockdep_assert_held(&kprobe_mutex); ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0); - if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret)) + if (ret < 0) return ret; if (*cnt == 0) { ret = register_ftrace_function(ops); - if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) { + if 
(ret < 0) { /* * At this point, sinec ops is not registered, we should be sefe from * registering empty filter. @@ -1178,6 +1178,10 @@ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int ret; lockdep_assert_held(&kprobe_mutex); + if (unlikely(kprobe_ftrace_disabled)) { + /* Now ftrace is disabled forever, disarm is already done. */ + return 0; + } if (*cnt == 1) { ret = unregister_ftrace_function(ops); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index a9e6354d9e25..f45ade718054 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -8,6 +8,7 @@ #include <asm/byteorder.h> #include <linux/kobject.h> +#include <linux/ksysfs.h> #include <linux/string.h> #include <linux/sysfs.h> #include <linux/export.h> @@ -213,7 +214,7 @@ static const struct attribute_group kernel_attr_group = { .attrs = kernel_attrs, }; -static int __init ksysfs_init(void) +void __init ksysfs_init(void) { int error; @@ -234,14 +235,12 @@ static int __init ksysfs_init(void) goto group_exit; } - return 0; + return; group_exit: sysfs_remove_group(kernel_kobj, &kernel_attr_group); kset_exit: kobject_put(kernel_kobj); exit: - return error; + pr_err("failed to initialize the kernel kobject: %d\n", error); } - -core_initcall(ksysfs_init); diff --git a/kernel/kthread.c b/kernel/kthread.c index 20451b624b67..791210daf8b4 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -85,24 +85,6 @@ static inline struct kthread *to_kthread(struct task_struct *k) return k->worker_private; } -/* - * Variant of to_kthread() that doesn't assume @p is a kthread. - * - * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will - * always remain a kthread. For kthreads p->worker_private always - * points to a struct kthread. For tasks that are not kthreads - * p->worker_private is used to point to other things. - * - * Return NULL for any task that is not a kthread. 
- */ -static inline struct kthread *__to_kthread(struct task_struct *p) -{ - void *kthread = p->worker_private; - if (kthread && !(p->flags & PF_KTHREAD)) - kthread = NULL; - return kthread; -} - void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); @@ -193,7 +175,7 @@ EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { - struct kthread *kthread = __to_kthread(current); + struct kthread *kthread = tsk_is_kthread(current); if (!kthread) return false; @@ -234,7 +216,7 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); */ void *kthread_func(struct task_struct *task) { - struct kthread *kthread = __to_kthread(task); + struct kthread *kthread = tsk_is_kthread(task); if (kthread) return kthread->threadfn; return NULL; @@ -266,7 +248,7 @@ EXPORT_SYMBOL_GPL(kthread_data); */ void *kthread_probe_data(struct task_struct *task) { - struct kthread *kthread = __to_kthread(task); + struct kthread *kthread = tsk_is_kthread(task); void *data = NULL; if (kthread) @@ -309,19 +291,8 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); -/** - * kthread_exit - Cause the current kthread return @result to kthread_stop(). - * @result: The integer value to return to kthread_stop(). - * - * While kthread_exit can be called directly, it exists so that - * functions which do some additional work in non-modular code such as - * module_put_and_kthread_exit can be implemented. - * - * Does not return. - */ -void __noreturn kthread_exit(long result) +void kthread_do_exit(struct kthread *kthread, long result) { - struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->affinity_node)) { mutex_lock(&kthread_affinity_lock); @@ -333,9 +304,7 @@ void __noreturn kthread_exit(long result) kthread->preferred_affinity = NULL; } } - do_exit(0); } -EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. 
@@ -683,7 +652,7 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu) bool kthread_is_per_cpu(struct task_struct *p) { - struct kthread *kthread = __to_kthread(p); + struct kthread *kthread = tsk_is_kthread(p); if (!kthread) return false; diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index cc68a3692905..532f455c5d4f 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -5,6 +5,7 @@ * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport <rppt@kernel.org> * Copyright (C) 2025 Google LLC, Changyuan Lyu <changyuanl@google.com> * Copyright (C) 2025 Pasha Tatashin <pasha.tatashin@soleen.com> + * Copyright (C) 2026 Google LLC, Jason Miu <jasonmiu@google.com> */ #define pr_fmt(fmt) "KHO: " fmt @@ -13,8 +14,10 @@ #include <linux/cma.h> #include <linux/kmemleak.h> #include <linux/count_zeros.h> +#include <linux/kasan.h> #include <linux/kexec.h> #include <linux/kexec_handover.h> +#include <linux/kho_radix_tree.h> #include <linux/kho/abi/kexec_handover.h> #include <linux/libfdt.h> #include <linux/list.h> @@ -64,163 +67,316 @@ static int __init kho_parse_enable(char *p) } early_param("kho", kho_parse_enable); -/* - * Keep track of memory that is to be preserved across KHO. - * - * The serializing side uses two levels of xarrays to manage chunks of per-order - * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order - * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 - * allocations each bitmap will cover 128M of address space. Thus, for 16G of - * memory at most 512K of bitmap memory will be needed for order 0. - * - * This approach is fully incremental, as the serialization progresses folios - * can continue be aggregated to the tracker. The final step, immediately prior - * to kexec would serialize the xarray information into a linked list for the - * successor kernel to parse. 
- */ - -#define PRESERVE_BITS (PAGE_SIZE * 8) - -struct kho_mem_phys_bits { - DECLARE_BITMAP(preserve, PRESERVE_BITS); -}; - -static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); - -struct kho_mem_phys { - /* - * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized - * to order. - */ - struct xarray phys_bits; -}; - -struct kho_mem_track { - /* Points to kho_mem_phys, each order gets its own bitmap tree */ - struct xarray orders; -}; - -struct khoser_mem_chunk; - struct kho_out { void *fdt; - bool finalized; - struct mutex lock; /* protects KHO FDT finalization */ + struct mutex lock; /* protects KHO FDT */ - struct kho_mem_track track; + struct kho_radix_tree radix_tree; struct kho_debugfs dbg; }; static struct kho_out kho_out = { .lock = __MUTEX_INITIALIZER(kho_out.lock), - .track = { - .orders = XARRAY_INIT(kho_out.track.orders, 0), + .radix_tree = { + .lock = __MUTEX_INITIALIZER(kho_out.radix_tree.lock), }, - .finalized = false, }; -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) +/** + * kho_radix_encode_key - Encodes a physical address and order into a radix key. + * @phys: The physical address of the page. + * @order: The order of the page. + * + * This function combines a page's physical address and its order into a + * single unsigned long, which is used as a key for all radix tree + * operations. + * + * Return: The encoded unsigned long radix key. + */ +static unsigned long kho_radix_encode_key(phys_addr_t phys, unsigned int order) { - void *res = xa_load(xa, index); + /* Order bits part */ + unsigned long h = 1UL << (KHO_ORDER_0_LOG2 - order); + /* Shifted physical address part */ + unsigned long l = phys >> (PAGE_SHIFT + order); - if (res) - return res; + return h | l; +} - void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); +/** + * kho_radix_decode_key - Decodes a radix key back into a physical address and order. + * @key: The unsigned long key to decode. 
+ * @order: An output parameter, a pointer to an unsigned int where the decoded + * page order will be stored. + * + * This function reverses the encoding performed by kho_radix_encode_key(), + * extracting the original physical address and page order from a given key. + * + * Return: The decoded physical address. + */ +static phys_addr_t kho_radix_decode_key(unsigned long key, unsigned int *order) +{ + unsigned int order_bit = fls64(key); + phys_addr_t phys; - if (!elm) - return ERR_PTR(-ENOMEM); + /* order_bit is numbered starting at 1 from fls64 */ + *order = KHO_ORDER_0_LOG2 - order_bit + 1; + /* The order is discarded by the shift */ + phys = key << (PAGE_SHIFT + *order); - if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) - return ERR_PTR(-EINVAL); + return phys; +} - res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); - if (xa_is_err(res)) - return ERR_PTR(xa_err(res)); - else if (res) - return res; +static unsigned long kho_radix_get_bitmap_index(unsigned long key) +{ + return key % (1 << KHO_BITMAP_SIZE_LOG2); +} + +static unsigned long kho_radix_get_table_index(unsigned long key, + unsigned int level) +{ + int s; - return no_free_ptr(elm); + s = ((level - 1) * KHO_TABLE_SIZE_LOG2) + KHO_BITMAP_SIZE_LOG2; + return (key >> s) % (1 << KHO_TABLE_SIZE_LOG2); } -static void __kho_unpreserve_order(struct kho_mem_track *track, unsigned long pfn, - unsigned int order) +/** + * kho_radix_add_page - Marks a page as preserved in the radix tree. + * @tree: The KHO radix tree. + * @pfn: The page frame number of the page to preserve. + * @order: The order of the page. + * + * This function traverses the radix tree based on the key derived from @pfn + * and @order. It sets the corresponding bit in the leaf bitmap to mark the + * page for preservation. If intermediate nodes do not exist along the path, + * they are allocated and added to the tree. + * + * Return: 0 on success, or a negative error code on failure. 
+ */ +int kho_radix_add_page(struct kho_radix_tree *tree, + unsigned long pfn, unsigned int order) { - struct kho_mem_phys_bits *bits; - struct kho_mem_phys *physxa; - const unsigned long pfn_high = pfn >> order; + /* Newly allocated nodes for error cleanup */ + struct kho_radix_node *intermediate_nodes[KHO_TREE_MAX_DEPTH] = { 0 }; + unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order); + struct kho_radix_node *anchor_node = NULL; + struct kho_radix_node *node = tree->root; + struct kho_radix_node *new_node; + unsigned int i, idx, anchor_idx; + struct kho_radix_leaf *leaf; + int err = 0; - physxa = xa_load(&track->orders, order); - if (WARN_ON_ONCE(!physxa)) - return; + if (WARN_ON_ONCE(!tree->root)) + return -EINVAL; - bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (WARN_ON_ONCE(!bits)) - return; + might_sleep(); + + guard(mutex)(&tree->lock); + + /* Go from high levels to low levels */ + for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) { + idx = kho_radix_get_table_index(key, i); + + if (node->table[idx]) { + node = phys_to_virt(node->table[idx]); + continue; + } + + /* Next node is empty, create a new node for it */ + new_node = (struct kho_radix_node *)get_zeroed_page(GFP_KERNEL); + if (!new_node) { + err = -ENOMEM; + goto err_free_nodes; + } + + node->table[idx] = virt_to_phys(new_node); + + /* + * Capture the node where the new branch starts for cleanup + * if allocation fails. 
+ */ + if (!anchor_node) { + anchor_node = node; + anchor_idx = idx; + } + intermediate_nodes[i] = new_node; + + node = new_node; + } + + /* Handle the leaf level bitmap (level 0) */ + idx = kho_radix_get_bitmap_index(key); + leaf = (struct kho_radix_leaf *)node; + __set_bit(idx, leaf->bitmap); - clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); + return 0; + +err_free_nodes: + for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) { + if (intermediate_nodes[i]) + free_page((unsigned long)intermediate_nodes[i]); + } + if (anchor_node) + anchor_node->table[anchor_idx] = 0; + + return err; } +EXPORT_SYMBOL_GPL(kho_radix_add_page); -static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, - unsigned long end_pfn) +/** + * kho_radix_del_page - Removes a page's preservation status from the radix tree. + * @tree: The KHO radix tree. + * @pfn: The page frame number of the page to unpreserve. + * @order: The order of the page. + * + * This function traverses the radix tree and clears the bit corresponding to + * the page, effectively removing its "preserved" status. It does not free + * the tree's intermediate nodes, even if they become empty. + */ +void kho_radix_del_page(struct kho_radix_tree *tree, unsigned long pfn, + unsigned int order) { - unsigned int order; + unsigned long key = kho_radix_encode_key(PFN_PHYS(pfn), order); + struct kho_radix_node *node = tree->root; + struct kho_radix_leaf *leaf; + unsigned int i, idx; - while (pfn < end_pfn) { - order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + if (WARN_ON_ONCE(!tree->root)) + return; - __kho_unpreserve_order(track, pfn, order); + might_sleep(); - pfn += 1 << order; + guard(mutex)(&tree->lock); + + /* Go from high levels to low levels */ + for (i = KHO_TREE_MAX_DEPTH - 1; i > 0; i--) { + idx = kho_radix_get_table_index(key, i); + + /* + * Attempting to delete a page that has not been preserved, + * return with a warning. 
+ */ + if (WARN_ON(!node->table[idx])) + return; + + node = phys_to_virt(node->table[idx]); } + + /* Handle the leaf level bitmap (level 0) */ + leaf = (struct kho_radix_leaf *)node; + idx = kho_radix_get_bitmap_index(key); + __clear_bit(idx, leaf->bitmap); } +EXPORT_SYMBOL_GPL(kho_radix_del_page); -static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, - unsigned int order) +static int kho_radix_walk_leaf(struct kho_radix_leaf *leaf, + unsigned long key, + kho_radix_tree_walk_callback_t cb) { - struct kho_mem_phys_bits *bits; - struct kho_mem_phys *physxa, *new_physxa; - const unsigned long pfn_high = pfn >> order; + unsigned long *bitmap = (unsigned long *)leaf; + unsigned int order; + phys_addr_t phys; + unsigned int i; + int err; - might_sleep(); - physxa = xa_load(&track->orders, order); - if (!physxa) { - int err; + for_each_set_bit(i, bitmap, PAGE_SIZE * BITS_PER_BYTE) { + phys = kho_radix_decode_key(key | i, &order); + err = cb(phys, order); + if (err) + return err; + } + + return 0; +} - new_physxa = kzalloc_obj(*physxa); - if (!new_physxa) - return -ENOMEM; +static int __kho_radix_walk_tree(struct kho_radix_node *root, + unsigned int level, unsigned long start, + kho_radix_tree_walk_callback_t cb) +{ + struct kho_radix_node *node; + struct kho_radix_leaf *leaf; + unsigned long key, i; + unsigned int shift; + int err; + + for (i = 0; i < PAGE_SIZE / sizeof(phys_addr_t); i++) { + if (!root->table[i]) + continue; - xa_init(&new_physxa->phys_bits); - physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa, - GFP_KERNEL); + shift = ((level - 1) * KHO_TABLE_SIZE_LOG2) + + KHO_BITMAP_SIZE_LOG2; + key = start | (i << shift); - err = xa_err(physxa); - if (err || physxa) { - xa_destroy(&new_physxa->phys_bits); - kfree(new_physxa); + node = phys_to_virt(root->table[i]); - if (err) - return err; + if (level == 1) { + /* + * we are at level 1, + * node is pointing to the level 0 bitmap. 
+ */ + leaf = (struct kho_radix_leaf *)node; + err = kho_radix_walk_leaf(leaf, key, cb); } else { - physxa = new_physxa; + err = __kho_radix_walk_tree(node, level - 1, + key, cb); } + + if (err) + return err; } - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (IS_ERR(bits)) - return PTR_ERR(bits); + return 0; +} + +/** + * kho_radix_walk_tree - Traverses the radix tree and calls a callback for each preserved page. + * @tree: A pointer to the KHO radix tree to walk. + * @cb: A callback function of type kho_radix_tree_walk_callback_t that will be + * invoked for each preserved page found in the tree. The callback receives + * the physical address and order of the preserved page. + * + * This function walks the radix tree, searching from the specified top level + * down to the lowest level (level 0). For each preserved page found, it invokes + * the provided callback, passing the page's physical address and order. + * + * Return: 0 if the walk completed the specified tree, or the non-zero return + * value from the callback that stopped the walk. + */ +int kho_radix_walk_tree(struct kho_radix_tree *tree, + kho_radix_tree_walk_callback_t cb) +{ + if (WARN_ON_ONCE(!tree->root)) + return -EINVAL; + + guard(mutex)(&tree->lock); - set_bit(pfn_high % PRESERVE_BITS, bits->preserve); + return __kho_radix_walk_tree(tree->root, KHO_TREE_MAX_DEPTH - 1, 0, cb); +} +EXPORT_SYMBOL_GPL(kho_radix_walk_tree); - return 0; +static void __kho_unpreserve(struct kho_radix_tree *tree, + unsigned long pfn, unsigned long end_pfn) +{ + unsigned int order; + + while (pfn < end_pfn) { + order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + + kho_radix_del_page(tree, pfn, order); + + pfn += 1 << order; + } } /* For physically contiguous 0-order pages. 
*/ static void kho_init_pages(struct page *page, unsigned long nr_pages) { - for (unsigned long i = 0; i < nr_pages; i++) + for (unsigned long i = 0; i < nr_pages; i++) { set_page_count(page + i, 1); + /* Clear each page's codetag to avoid accounting mismatch. */ + clear_page_tag_ref(page + i); + } } static void kho_init_folio(struct page *page, unsigned int order) @@ -229,6 +385,8 @@ static void kho_init_folio(struct page *page, unsigned int order) /* Head page gets refcount of 1. */ set_page_count(page, 1); + /* Clear head page's codetag to avoid accounting mismatch. */ + clear_page_tag_ref(page); /* For higher order folios, tail pages get a page count of zero. */ for (unsigned long i = 1; i < nr_pages; i++) @@ -253,7 +411,7 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) * check also implicitly makes sure phys is order-aligned since for * non-order-aligned phys addresses, magic will never be set. */ - if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER)) + if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC)) return NULL; nr_pages = (1 << info.order); @@ -265,14 +423,6 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) else kho_init_pages(page, nr_pages); - /* Always mark headpage's codetag as empty to avoid accounting mismatch */ - clear_page_tag_ref(page); - if (!is_folio) { - /* Also do that for the non-compound tail pages */ - for (unsigned int i = 1; i < nr_pages; i++) - clear_page_tag_ref(page + i); - } - adjust_managed_page_count(page, nr_pages); return page; } @@ -321,161 +471,24 @@ struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages) } EXPORT_SYMBOL_GPL(kho_restore_pages); -/* Serialize and deserialize struct kho_mem_phys across kexec - * - * Record all the bitmaps in a linked list of pages for the next kernel to - * process. Each chunk holds bitmaps of the same order and each block of bitmaps - * starts at a given physical address. 
This allows the bitmaps to be sparse. The - * xarray is used to store them in a tree while building up the data structure, - * but the KHO successor kernel only needs to process them once in order. - * - * All of this memory is normal kmalloc() memory and is not marked for - * preservation. The successor kernel will remain isolated to the scratch space - * until it completes processing this list. Once processed all the memory - * storing these ranges will be marked as free. - */ - -struct khoser_mem_bitmap_ptr { - phys_addr_t phys_start; - DECLARE_KHOSER_PTR(bitmap, struct kho_mem_phys_bits *); -}; - -struct khoser_mem_chunk_hdr { - DECLARE_KHOSER_PTR(next, struct khoser_mem_chunk *); - unsigned int order; - unsigned int num_elms; -}; - -#define KHOSER_BITMAP_SIZE \ - ((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \ - sizeof(struct khoser_mem_bitmap_ptr)) - -struct khoser_mem_chunk { - struct khoser_mem_chunk_hdr hdr; - struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE]; -}; - -static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); - -static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, - unsigned long order) -{ - struct khoser_mem_chunk *chunk __free(free_page) = NULL; - - chunk = (void *)get_zeroed_page(GFP_KERNEL); - if (!chunk) - return ERR_PTR(-ENOMEM); - - if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) - return ERR_PTR(-EINVAL); - - chunk->hdr.order = order; - if (cur_chunk) - KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return no_free_ptr(chunk); -} - -static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) +static int __init kho_preserved_memory_reserve(phys_addr_t phys, + unsigned int order) { - struct khoser_mem_chunk *chunk = first_chunk; - - while (chunk) { - struct khoser_mem_chunk *tmp = chunk; - - chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - free_page((unsigned long)tmp); - } -} - -/* - * Update memory map property, if old one is found discard it via - * kho_mem_ser_free(). 
- */ -static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) -{ - void *ptr; - u64 phys; - - ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL); - - /* Check and discard previous memory map */ - phys = get_unaligned((u64 *)ptr); - if (phys) - kho_mem_ser_free((struct khoser_mem_chunk *)phys_to_virt(phys)); - - /* Update with the new value */ - phys = first_chunk ? (u64)virt_to_phys(first_chunk) : 0; - put_unaligned(phys, (u64 *)ptr); -} - -static int kho_mem_serialize(struct kho_out *kho_out) -{ - struct khoser_mem_chunk *first_chunk = NULL; - struct khoser_mem_chunk *chunk = NULL; - struct kho_mem_phys *physxa; - unsigned long order; - int err = -ENOMEM; - - xa_for_each(&kho_out->track.orders, order, physxa) { - struct kho_mem_phys_bits *bits; - unsigned long phys; - - chunk = new_chunk(chunk, order); - if (IS_ERR(chunk)) { - err = PTR_ERR(chunk); - goto err_free; - } - - if (!first_chunk) - first_chunk = chunk; - - xa_for_each(&physxa->phys_bits, phys, bits) { - struct khoser_mem_bitmap_ptr *elm; + union kho_page_info info; + struct page *page; + u64 sz; - if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { - chunk = new_chunk(chunk, order); - if (IS_ERR(chunk)) { - err = PTR_ERR(chunk); - goto err_free; - } - } + sz = 1 << (order + PAGE_SHIFT); + page = phys_to_page(phys); - elm = &chunk->bitmaps[chunk->hdr.num_elms]; - chunk->hdr.num_elms++; - elm->phys_start = (phys * PRESERVE_BITS) - << (order + PAGE_SHIFT); - KHOSER_STORE_PTR(elm->bitmap, bits); - } - } - - kho_update_memory_map(first_chunk); + /* Reserve the memory preserved in KHO in memblock */ + memblock_reserve(phys, sz); + memblock_reserved_mark_noinit(phys, sz); + info.magic = KHO_PAGE_MAGIC; + info.order = order; + page->private = info.page_private; return 0; - -err_free: - kho_mem_ser_free(first_chunk); - return err; -} - -static void __init deserialize_bitmap(unsigned int order, - struct khoser_mem_bitmap_ptr *elm) -{ - struct kho_mem_phys_bits *bitmap = 
KHOSER_LOAD_PTR(elm->bitmap); - unsigned long bit; - - for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) { - int sz = 1 << (order + PAGE_SHIFT); - phys_addr_t phys = - elm->phys_start + (bit << (order + PAGE_SHIFT)); - struct page *page = phys_to_page(phys); - union kho_page_info info; - - memblock_reserve(phys, sz); - memblock_reserved_mark_noinit(phys, sz); - info.magic = KHO_PAGE_MAGIC; - info.order = order; - page->private = info.page_private; - } } /* Returns physical address of the preserved memory map from FDT */ @@ -486,25 +499,13 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt) mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len); if (!mem_ptr || len != sizeof(u64)) { - pr_err("failed to get preserved memory bitmaps\n"); + pr_err("failed to get preserved memory map\n"); return 0; } return get_unaligned((const u64 *)mem_ptr); } -static void __init kho_mem_deserialize(struct khoser_mem_chunk *chunk) -{ - while (chunk) { - unsigned int i; - - for (i = 0; i != chunk->hdr.num_elms; i++) - deserialize_bitmap(chunk->hdr.order, - &chunk->bitmaps[i]); - chunk = KHOSER_LOAD_PTR(chunk->hdr.next); - } -} - /* * With KHO enabled, memory can become fragmented because KHO regions may * be anywhere in physical address space. 
The scratch regions give us a @@ -815,14 +816,14 @@ EXPORT_SYMBOL_GPL(kho_remove_subtree); */ int kho_preserve_folio(struct folio *folio) { + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.track; if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) return -EINVAL; - return __kho_preserve_order(track, pfn, order); + return kho_radix_add_page(tree, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -836,11 +837,11 @@ EXPORT_SYMBOL_GPL(kho_preserve_folio); */ void kho_unpreserve_folio(struct folio *folio) { + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long pfn = folio_pfn(folio); const unsigned int order = folio_order(folio); - struct kho_mem_track *track = &kho_out.track; - __kho_unpreserve_order(track, pfn, order); + kho_radix_del_page(tree, pfn, order); } EXPORT_SYMBOL_GPL(kho_unpreserve_folio); @@ -856,7 +857,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_folio); */ int kho_preserve_pages(struct page *page, unsigned long nr_pages) { - struct kho_mem_track *track = &kho_out.track; + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn = start_pfn; @@ -869,10 +870,18 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages) } while (pfn < end_pfn) { - const unsigned int order = + unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - err = __kho_preserve_order(track, pfn, order); + /* + * Make sure all the pages in a single preservation are in the + * same NUMA node. The restore machinery can not cope with a + * preservation spanning multiple NUMA nodes. 
+ */ + while (pfn_to_nid(pfn) != pfn_to_nid(pfn + (1UL << order) - 1)) + order--; + + err = kho_radix_add_page(tree, pfn, order); if (err) { failed_pfn = pfn; break; @@ -882,7 +891,7 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages) } if (err) - __kho_unpreserve(track, start_pfn, failed_pfn); + __kho_unpreserve(tree, start_pfn, failed_pfn); return err; } @@ -900,11 +909,11 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages); */ void kho_unpreserve_pages(struct page *page, unsigned long nr_pages) { - struct kho_mem_track *track = &kho_out.track; + struct kho_radix_tree *tree = &kho_out.radix_tree; const unsigned long start_pfn = page_to_pfn(page); const unsigned long end_pfn = start_pfn + nr_pages; - __kho_unpreserve(track, start_pfn, end_pfn); + __kho_unpreserve(tree, start_pfn, end_pfn); } EXPORT_SYMBOL_GPL(kho_unpreserve_pages); @@ -963,14 +972,14 @@ err_free: static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, unsigned short order) { - struct kho_mem_track *track = &kho_out.track; + struct kho_radix_tree *tree = &kho_out.radix_tree; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); - __kho_unpreserve(track, pfn, pfn + 1); + __kho_unpreserve(tree, pfn, pfn + 1); for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); - __kho_unpreserve(track, pfn, pfn + (1 << order)); + __kho_unpreserve(tree, pfn, pfn + (1 << order)); } } @@ -1077,6 +1086,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_vmalloc); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) { struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); + kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_PROT_NORMAL; unsigned int align, order, shift, vm_flags; unsigned long total_pages, contig_pages; unsigned long addr, size; @@ -1128,7 +1138,8 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) goto err_free_pages_array; area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift, - vm_flags, 
VMALLOC_START, VMALLOC_END, + vm_flags | VM_UNINITIALIZED, + VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); if (!area) @@ -1143,6 +1154,13 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) area->nr_pages = total_pages; area->pages = pages; + if (vm_flags & VM_ALLOC) + kasan_flags |= KASAN_VMALLOC_VM_ALLOC; + + area->addr = kasan_unpoison_vmalloc(area->addr, total_pages * PAGE_SIZE, + kasan_flags); + clear_vm_uninitialized_flag(area); + return area->addr; err_free_vm_area: @@ -1239,33 +1257,9 @@ void kho_restore_free(void *mem) } EXPORT_SYMBOL_GPL(kho_restore_free); -int kho_finalize(void) -{ - int ret; - - if (!kho_enable) - return -EOPNOTSUPP; - - guard(mutex)(&kho_out.lock); - ret = kho_mem_serialize(&kho_out); - if (ret) - return ret; - - kho_out.finalized = true; - - return 0; -} - -bool kho_finalized(void) -{ - guard(mutex)(&kho_out.lock); - return kho_out.finalized; -} - struct kho_in { phys_addr_t fdt_phys; phys_addr_t scratch_phys; - phys_addr_t mem_map_phys; struct kho_debugfs dbg; }; @@ -1333,18 +1327,46 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) } EXPORT_SYMBOL_GPL(kho_retrieve_subtree); +static int __init kho_mem_retrieve(const void *fdt) +{ + struct kho_radix_tree tree; + const phys_addr_t *mem; + int len; + + /* Retrieve the KHO radix tree from passed-in FDT. 
*/ + mem = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len); + + if (!mem || len != sizeof(*mem)) { + pr_err("failed to get preserved KHO memory tree\n"); + return -ENOENT; + } + + if (!*mem) + return -EINVAL; + + tree.root = phys_to_virt(*mem); + mutex_init(&tree.lock); + return kho_radix_walk_tree(&tree, kho_preserved_memory_reserve); +} + static __init int kho_out_fdt_setup(void) { + struct kho_radix_tree *tree = &kho_out.radix_tree; void *root = kho_out.fdt; - u64 empty_mem_map = 0; + u64 preserved_mem_tree_pa; int err; err = fdt_create(root, PAGE_SIZE); err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map, - sizeof(empty_mem_map)); + + preserved_mem_tree_pa = virt_to_phys(tree->root); + + err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, + &preserved_mem_tree_pa, + sizeof(preserved_mem_tree_pa)); + err |= fdt_end_node(root); err |= fdt_finish(root); @@ -1353,16 +1375,23 @@ static __init int kho_out_fdt_setup(void) static __init int kho_init(void) { + struct kho_radix_tree *tree = &kho_out.radix_tree; const void *fdt = kho_get_fdt(); int err = 0; if (!kho_enable) return 0; + tree->root = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!tree->root) { + err = -ENOMEM; + goto err_free_scratch; + } + kho_out.fdt = kho_alloc_preserve(PAGE_SIZE); if (IS_ERR(kho_out.fdt)) { err = PTR_ERR(kho_out.fdt); - goto err_free_scratch; + goto err_free_kho_radix_tree_root; } err = kho_debugfs_init(); @@ -1408,6 +1437,9 @@ static __init int kho_init(void) err_free_fdt: kho_unpreserve_free(kho_out.fdt); +err_free_kho_radix_tree_root: + kfree(tree->root); + tree->root = NULL; err_free_scratch: kho_out.fdt = NULL; for (int i = 0; i < kho_scratch_cnt; i++) { @@ -1447,10 +1479,12 @@ static void __init kho_release_scratch(void) void __init kho_memory_init(void) { - if (kho_in.mem_map_phys) { + if (kho_in.scratch_phys) { 
kho_scratch = phys_to_virt(kho_in.scratch_phys); kho_release_scratch(); - kho_mem_deserialize(phys_to_virt(kho_in.mem_map_phys)); + + if (kho_mem_retrieve(kho_get_fdt())) + kho_in.fdt_phys = 0; } else { kho_reserve_scratch(); } @@ -1528,7 +1562,6 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_in.fdt_phys = fdt_phys; kho_in.scratch_phys = scratch_phys; - kho_in.mem_map_phys = mem_map_phys; kho_scratch_cnt = scratch_cnt; populated = true; diff --git a/kernel/liveupdate/kexec_handover_debugfs.c b/kernel/liveupdate/kexec_handover_debugfs.c index 2f93939168ab..acf368222682 100644 --- a/kernel/liveupdate/kexec_handover_debugfs.c +++ b/kernel/liveupdate/kexec_handover_debugfs.c @@ -13,6 +13,7 @@ #include <linux/io.h> #include <linux/libfdt.h> #include <linux/mm.h> +#include <linux/kho/abi/kexec_handover.h> #include "kexec_handover_internal.h" static struct dentry *debugfs_root; @@ -75,24 +76,6 @@ void kho_debugfs_fdt_remove(struct kho_debugfs *dbg, void *fdt) } } -static int kho_out_finalize_get(void *data, u64 *val) -{ - *val = kho_finalized(); - - return 0; -} - -static int kho_out_finalize_set(void *data, u64 val) -{ - if (val) - return kho_finalize(); - else - return -EINVAL; -} - -DEFINE_DEBUGFS_ATTRIBUTE(kho_out_finalize_fops, kho_out_finalize_get, - kho_out_finalize_set, "%llu\n"); - static int scratch_phys_show(struct seq_file *m, void *v) { for (int i = 0; i < kho_scratch_cnt; i++) @@ -139,7 +122,7 @@ __init void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt) const char *name = fdt_get_name(fdt, child, NULL); const u64 *fdt_phys; - fdt_phys = fdt_getprop(fdt, child, "fdt", &len); + fdt_phys = fdt_getprop(fdt, child, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!fdt_phys) continue; if (len != sizeof(*fdt_phys)) { @@ -198,11 +181,6 @@ __init int kho_out_debugfs_init(struct kho_debugfs *dbg) if (IS_ERR(f)) goto err_rmdir; - f = debugfs_create_file("finalize", 0600, dir, NULL, - &kho_out_finalize_fops); - if (IS_ERR(f)) - goto err_rmdir; 
- dbg->dir = dir; dbg->sub_fdt_dir = sub_fdt_dir; return 0; diff --git a/kernel/liveupdate/kexec_handover_internal.h b/kernel/liveupdate/kexec_handover_internal.h index 0202c85ad14f..9a832a35254c 100644 --- a/kernel/liveupdate/kexec_handover_internal.h +++ b/kernel/liveupdate/kexec_handover_internal.h @@ -22,9 +22,6 @@ struct kho_debugfs {}; extern struct kho_scratch *kho_scratch; extern unsigned int kho_scratch_cnt; -bool kho_finalized(void); -int kho_finalize(void); - #ifdef CONFIG_KEXEC_HANDOVER_DEBUGFS int kho_debugfs_init(void); void kho_in_debugfs_init(struct kho_debugfs *dbg, const void *fdt); diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index dda7bb57d421..84ac728d63ba 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -230,17 +230,7 @@ int liveupdate_reboot(void) luo_flb_serialize(); - err = kho_finalize(); - if (err) { - pr_err("kho_finalize failed %d\n", err); - /* - * kho_finalize() may return libfdt errors, to aboid passing to - * userspace unknown errors, change this to EAGAIN. - */ - err = -EAGAIN; - } - - return err; + return 0; } /** diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 8c79058253e1..5acee4174bf0 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -134,9 +134,12 @@ static LIST_HEAD(luo_file_handler_list); * state that is not preserved. Set by the handler's .preserve() * callback, and must be freed in the handler's .unpreserve() * callback. - * @retrieved: A flag indicating whether a user/kernel in the new kernel has + * @retrieve_status: Status code indicating whether a user/kernel in the new kernel has * successfully called retrieve() on this file. This prevents - * multiple retrieval attempts. + * multiple retrieval attempts. 
A value of 0 means a retrieve() + * has not been attempted, a positive value means the retrieve() + * was successful, and a negative value means the retrieve() + * failed, and the value is the error code of the call. * @mutex: A mutex that protects the fields of this specific instance * (e.g., @retrieved, @file), ensuring that operations like * retrieving or finishing a file are atomic. @@ -161,7 +164,7 @@ struct luo_file { struct file *file; u64 serialized_data; void *private_data; - bool retrieved; + int retrieve_status; struct mutex mutex; struct list_head list; u64 token; @@ -298,7 +301,6 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) luo_file->file = file; luo_file->fh = fh; luo_file->token = token; - luo_file->retrieved = false; mutex_init(&luo_file->mutex); args.handler = fh; @@ -577,7 +579,12 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, return -ENOENT; guard(mutex)(&luo_file->mutex); - if (luo_file->retrieved) { + if (luo_file->retrieve_status < 0) { + /* Retrieve was attempted and it failed. Return the error code. */ + return luo_file->retrieve_status; + } + + if (luo_file->retrieve_status > 0) { /* * Someone is asking for this file again, so get a reference * for them. @@ -590,16 +597,19 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, args.handler = luo_file->fh; args.serialized_data = luo_file->serialized_data; err = luo_file->fh->ops->retrieve(&args); - if (!err) { - luo_file->file = args.file; - - /* Get reference so we can keep this file in LUO until finish */ - get_file(luo_file->file); - *filep = luo_file->file; - luo_file->retrieved = true; + if (err) { + /* Keep the error code for later use. 
*/ + luo_file->retrieve_status = err; + return err; } - return err; + luo_file->file = args.file; + /* Get reference so we can keep this file in LUO until finish */ + get_file(luo_file->file); + *filep = luo_file->file; + luo_file->retrieve_status = 1; + + return 0; } static int luo_file_can_finish_one(struct luo_file_set *file_set, @@ -615,7 +625,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set, args.handler = luo_file->fh; args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; - args.retrieved = luo_file->retrieved; + args.retrieve_status = luo_file->retrieve_status; can_finish = luo_file->fh->ops->can_finish(&args); } @@ -632,7 +642,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set, args.handler = luo_file->fh; args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; - args.retrieved = luo_file->retrieved; + args.retrieve_status = luo_file->retrieve_status; luo_file->fh->ops->finish(&args); luo_flb_file_finish(luo_file->fh); @@ -788,7 +798,6 @@ int luo_file_deserialize(struct luo_file_set *file_set, luo_file->file = NULL; luo_file->serialized_data = file_ser[i].data; luo_file->token = file_ser[i].token; - luo_file->retrieved = false; mutex_init(&luo_file->mutex); list_add_tail(&luo_file->list, &file_set->files_list); } diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c index 783677295640..25ae704d7787 100644 --- a/kernel/liveupdate/luo_session.c +++ b/kernel/liveupdate/luo_session.c @@ -558,8 +558,13 @@ int luo_session_deserialize(void) } scoped_guard(mutex, &session->mutex) { - luo_file_deserialize(&session->file_set, - &sh->ser[i].file_set_ser); + err = luo_file_deserialize(&session->file_set, + &sh->ser[i].file_set_ser); + } + if (err) { + pr_warn("Failed to deserialize files for session [%s] %pe\n", + session->name, ERR_PTR(err)); + return err; } } diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index a114949eeed5..cee1901d4cff 100644 
--- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,6 +3,11 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n +CONTEXT_ANALYSIS_mutex.o := y +CONTEXT_ANALYSIS_rtmutex_api.o := y +CONTEXT_ANALYSIS_ww_rt_mutex.o := y +CONTEXT_ANALYSIS_rwsem.o := y + obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o # Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 2c6b02d4699b..785decd9d0c0 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -37,9 +37,8 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) { lockdep_assert_held(&lock->wait_lock); - DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); + DEBUG_LOCKS_WARN_ON(!lock->first_waiter); DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); } void debug_mutex_free_waiter(struct mutex_waiter *waiter) @@ -54,15 +53,14 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, lockdep_assert_held(&lock->wait_lock); /* Current thread can't be already blocked (since it's executing!) 
*/ - DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task)); + DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task)); } void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { - struct mutex *blocked_on = __get_task_blocked_on(task); + struct mutex *blocked_on = get_task_blocked_on(task); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); @@ -74,7 +72,6 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2a1d165b3167..186b463fe326 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -46,8 +46,9 @@ static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); - raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); + scoped_guard (raw_spinlock_init, &lock->wait_lock) { + lock->first_waiter = NULL; + } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif @@ -150,6 +151,7 @@ EXPORT_SYMBOL(mutex_init_generic); * follow with a __mutex_trylock() before failing. 
*/ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) + __cond_acquires(true, lock) { unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; @@ -163,6 +165,7 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) } static __always_inline bool __mutex_unlock_fast(struct mutex *lock) + __cond_releases(true, lock) { unsigned long curr = (unsigned long)current; @@ -171,7 +174,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) #else /* !CONFIG_DEBUG_LOCK_ALLOC */ -void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key) +void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key) { __mutex_init_generic(lock); @@ -181,7 +184,7 @@ void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_k debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); } -EXPORT_SYMBOL(mutex_init_lockep); +EXPORT_SYMBOL(mutex_init_lockdep); #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) @@ -194,33 +197,44 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) atomic_long_andnot(flag, &lock->owner); } -static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter) -{ - return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; -} - /* * Add @waiter to a given location in the lock wait_list and set the * FLAG_WAITERS flag if it's the first waiter. 
*/ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct list_head *list) + struct mutex_waiter *first) + __must_hold(&lock->wait_lock) { hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); debug_mutex_add_waiter(lock, waiter, current); - list_add_tail(&waiter->list, list); - if (__mutex_waiter_is_first(lock, waiter)) + if (!first) + first = lock->first_waiter; + + if (first) { + list_add_tail(&waiter->list, &first->list); + } else { + INIT_LIST_HEAD(&waiter->list); + lock->first_waiter = waiter; __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + } } static void __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { - list_del(&waiter->list); - if (likely(list_empty(&lock->wait_list))) + if (list_empty(&waiter->list)) { __mutex_clear_flag(lock, MUTEX_FLAGS); + lock->first_waiter = NULL; + } else { + if (lock->first_waiter == waiter) { + lock->first_waiter = list_first_entry(&waiter->list, + struct mutex_waiter, list); + } + list_del(&waiter->list); + } debug_mutex_remove_waiter(lock, waiter, current); hung_task_clear_blocker(); @@ -259,7 +273,8 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static void __sched __mutex_lock_slowpath(struct mutex *lock); +static void __sched __mutex_lock_slowpath(struct mutex *lock) + __acquires(lock); /** * mutex_lock - acquire the mutex @@ -340,7 +355,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * Similarly, stop spinning if we are no longer the * first waiter. 
*/ - if (waiter && !__mutex_waiter_is_first(lock, waiter)) + if (waiter && data_race(lock->first_waiter != waiter)) return false; return true; @@ -525,7 +540,8 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, } #endif -static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip); +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) + __releases(lock); /** * mutex_unlock - release the mutex @@ -565,6 +581,7 @@ EXPORT_SYMBOL(mutex_unlock); * of a unlocked mutex is not allowed. */ void __sched ww_mutex_unlock(struct ww_mutex *lock) + __no_context_analysis { __ww_mutex_unlock(lock); mutex_unlock(&lock->base); @@ -578,6 +595,7 @@ static __always_inline int __sched __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + __cond_acquires(0, lock) { DEFINE_WAKE_Q(wake_q); struct mutex_waiter waiter; @@ -645,7 +663,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - __mutex_add_waiter(lock, &waiter, &lock->wait_list); + __mutex_add_waiter(lock, &waiter, NULL); } else { /* * Add in stamp order, waking up waiters that must kill @@ -656,6 +674,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } + raw_spin_lock(¤t->blocked_lock); __set_task_blocked_on(current, lock); set_current_state(state); trace_contention_begin(lock, LCB_F_MUTEX); @@ -669,8 +688,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas * the handoff. */ if (__mutex_trylock(lock)) - goto acquired; + break; + raw_spin_unlock(¤t->blocked_lock); /* * Check for signals and kill conditions while holding * wait_lock. 
This ensures the lock cancellation is ordered @@ -691,14 +711,16 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas schedule_preempt_disabled(); - first = __mutex_waiter_is_first(lock, &waiter); + first = lock->first_waiter == &waiter; + raw_spin_lock_irqsave(&lock->wait_lock, flags); + raw_spin_lock(¤t->blocked_lock); /* * As we likely have been woken up by task * that has cleared our blocked_on state, re-set * it to the lock we are trying to acquire. */ - set_task_blocked_on(current, lock); + __set_task_blocked_on(current, lock); set_current_state(state); /* * Here we order against unlock; we must either see it change @@ -709,33 +731,40 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas break; if (first) { - trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + bool opt_acquired; + /* * mutex_optimistic_spin() can call schedule(), so - * clear blocked on so we don't become unselectable + * we need to release these locks before calling it, + * and clear blocked on so we don't become unselectable * to run. 
*/ - clear_task_blocked_on(current, lock); - if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) + __clear_task_blocked_on(current, lock); + raw_spin_unlock(¤t->blocked_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter); + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + raw_spin_lock(¤t->blocked_lock); + __set_task_blocked_on(current, lock); + + if (opt_acquired) break; - set_task_blocked_on(current, lock); trace_contention_begin(lock, LCB_F_MUTEX); } - - raw_spin_lock_irqsave(&lock->wait_lock, flags); } - raw_spin_lock_irqsave(&lock->wait_lock, flags); -acquired: __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); + raw_spin_unlock(¤t->blocked_lock); if (ww_ctx) { /* * Wound-Wait; we stole the lock (!first_waiter), check the * waiters as anyone might want to wound us. */ - if (!ww_ctx->is_wait_die && - !__mutex_waiter_is_first(lock, &waiter)) + if (!ww_ctx->is_wait_die && lock->first_waiter != &waiter) __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); } @@ -756,11 +785,11 @@ skip_wait: return 0; err: - __clear_task_blocked_on(current, lock); + clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: - WARN_ON(__get_task_blocked_on(current)); + WARN_ON(get_task_blocked_on(current)); trace_contention_end(lock, ret); raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); @@ -772,6 +801,7 @@ err_early_kill: static int __sched __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip) + __cond_acquires(0, lock) { return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); } @@ -779,6 +809,7 @@ __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, static int __sched __ww_mutex_lock(struct mutex 
*lock, unsigned int state, unsigned int subclass, unsigned long ip, struct ww_acquire_ctx *ww_ctx) + __cond_acquires(0, lock) { return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); } @@ -826,6 +857,7 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + __acquire(lock); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -834,6 +866,7 @@ void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); + __acquire(lock); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -862,12 +895,14 @@ mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) token = io_schedule_prepare(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_, NULL, 0); + __acquire(lock); io_schedule_finish(token); } EXPORT_SYMBOL_GPL(mutex_lock_io_nested); static inline int ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_releases(nonzero, lock) { #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH unsigned tmp; @@ -929,13 +964,16 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); * Release the lock, slowpath: */ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) + __releases(lock) { struct task_struct *next = NULL; + struct mutex_waiter *waiter; DEFINE_WAKE_Q(wake_q); unsigned long owner; unsigned long flags; mutex_release(&lock->dep_map, ip); + __release(lock); /* * Release the lock before (potentially) taking the spinlock such that @@ -962,16 +1000,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { - /* get the first entry from the wait-list: */ - struct mutex_waiter *waiter = - list_first_entry(&lock->wait_list, - struct mutex_waiter, list); - + waiter = lock->first_waiter; + 
if (waiter) { next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - __clear_task_blocked_on(next, lock); + set_task_blocked_on_waking(next, lock); wake_q_add(&wake_q, next); } @@ -1061,24 +1095,29 @@ EXPORT_SYMBOL_GPL(mutex_lock_io); static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) + __acquires(lock) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + __acquire(lock); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) + __cond_acquires(0, lock) { return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) + __cond_acquires(0, lock) { return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_acquires(0, lock) { return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, _RET_IP_, ctx); @@ -1087,6 +1126,7 @@ __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_acquires(0, lock) { return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, _RET_IP_, ctx); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 9ad4da8cea00..3e263e98e5fc 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -7,6 +7,7 @@ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> */ #ifndef CONFIG_PREEMPT_RT +#include <linux/mutex.h> /* * This is the control structure for tasks blocked on mutex, which resides * on the blocked task's kernel stack: @@ -47,6 +48,12 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock) return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); } +static inline struct mutex *get_task_blocked_on(struct task_struct *p) +{ + 
guard(raw_spinlock_irqsave)(&p->blocked_lock); + return __get_task_blocked_on(p); +} + #ifdef CONFIG_DEBUG_MUTEXES extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c80902eacd79..ccaba6148b61 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -94,6 +94,7 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock, static __always_inline struct task_struct * rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) + __must_hold(&lock->wait_lock) { unsigned long val = (unsigned long)owner; @@ -105,6 +106,7 @@ rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) static __always_inline void rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) + __must_hold(&lock->wait_lock) { /* * lock->wait_lock is held but explicit acquire semantics are needed @@ -114,12 +116,14 @@ rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) } static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { /* lock->wait_lock is held so the unlock provides release semantics. 
*/ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL)); } static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); @@ -127,6 +131,7 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock) + __must_hold(&lock->wait_lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -328,6 +333,7 @@ static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, } static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); @@ -1206,6 +1212,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, enum rtmutex_chainwalk chwalk, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -1249,6 +1256,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, /* Check whether the waiter should back out immediately */ rtm = container_of(lock, struct rt_mutex, rtmutex); + __assume_ctx_lock(&rtm->rtmutex.wait_lock); res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); if (res) { raw_spin_lock(&task->pi_lock); @@ -1356,6 +1364,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, } static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { int ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1505,7 +1514,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * - the VCPU on which owner runs is preempted */ if (!owner_on_cpu(owner) || need_resched() || - !rt_mutex_waiter_is_top_waiter(lock, 
waiter)) { + !data_race(rt_mutex_waiter_is_top_waiter(lock, waiter))) { res = false; break; } @@ -1538,6 +1547,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, */ static void __sched remove_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); @@ -1613,6 +1623,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + __assume_ctx_lock(&rtm->rtmutex.wait_lock); + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ @@ -1658,6 +1670,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, struct rt_mutex_base *lock, struct rt_mutex_waiter *w) + __must_hold(&lock->wait_lock) { /* * If the result is not -EDEADLOCK or the caller requested @@ -1694,11 +1707,13 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, enum rtmutex_chainwalk chwalk, struct rt_mutex_waiter *waiter, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); struct ww_mutex *ww = ww_container_of(rtm); int ret; + __assume_ctx_lock(&rtm->rtmutex.wait_lock); lockdep_assert_held(&lock->wait_lock); lockevent_inc(rtmutex_slowlock); @@ -1750,6 +1765,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct rt_mutex_waiter waiter; int ret; diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 59dbd29cb219..124219aea46e 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -526,6 +526,7 @@ static __always_inline int __mutex_lock_common(struct mutex *lock, unsigned int subclass, struct 
lockdep_map *nest_lock, unsigned long ip) + __acquires(lock) __no_context_analysis { int ret; @@ -647,6 +648,7 @@ EXPORT_SYMBOL(mutex_trylock); #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_unlock(struct mutex *lock) + __releases(lock) __no_context_analysis { mutex_release(&lock->dep_map, _RET_IP_); __rt_mutex_unlock(&lock->rtmutex); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index cf6ddd1b23a2..c38b7bdea7b3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -79,12 +79,18 @@ struct rt_wake_q_head { * PI-futex support (proxy locking functions, etc.): */ extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); + struct task_struct *proxy_owner) + __must_hold(&lock->wait_lock); + +extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock); + extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - struct wake_q_head *); + struct wake_q_head *) + __must_hold(&lock->wait_lock); + extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); @@ -94,8 +100,9 @@ extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter); -extern int rt_mutex_futex_trylock(struct rt_mutex_base *l); -extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l); +extern int rt_mutex_futex_trylock(struct rt_mutex_base *lock); +extern int __rt_mutex_futex_trylock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock); extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock, @@ -109,6 +116,7 @@ extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh); */ #ifdef 
CONFIG_RT_MUTEXES static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } @@ -120,6 +128,7 @@ static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) */ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); @@ -127,6 +136,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, } static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; @@ -170,9 +180,10 @@ enum rtmutex_chainwalk { static inline void __rt_mutex_base_init(struct rt_mutex_base *lock) { - raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT_CACHED; - lock->owner = NULL; + scoped_guard (raw_spinlock_init, &lock->wait_lock) { + lock->waiters = RB_ROOT_CACHED; + lock->owner = NULL; + } } /* Debug functions */ diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 9f4322c07486..82e078c0665a 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -186,6 +186,7 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, unsigned long flags) + __releases(&rwb->rtmutex.wait_lock) { struct rt_mutex_base *rtm = &rwb->rtmutex; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 24df4d98f7d2..bf647097369c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -72,7 +72,7 @@ #c, atomic_long_read(&(sem)->count), \ (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ - list_empty(&(sem)->wait_list) ? "" : "not ")) \ + rwsem_is_contended(sem) ? 
"" : "not ")) \ debug_locks_off(); \ } while (0) #else @@ -320,9 +320,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->magic = sem; #endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); atomic_long_set(&sem->owner, 0L); + scoped_guard (raw_spinlock_init, &sem->wait_lock) { + sem->first_waiter = NULL; + } #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif @@ -341,8 +342,6 @@ struct rwsem_waiter { unsigned long timeout; bool handoff_set; }; -#define rwsem_first_waiter(sem) \ - list_first_entry(&sem->wait_list, struct rwsem_waiter, list) enum rwsem_wake_type { RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ @@ -365,12 +364,22 @@ enum rwsem_wake_type { */ #define MAX_READERS_WAKEUP 0x100 -static inline void -rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +static inline +bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { - lockdep_assert_held(&sem->wait_lock); - list_add_tail(&waiter->list, &sem->wait_list); - /* caller will set RWSEM_FLAG_WAITERS */ + if (list_empty(&waiter->list)) { + sem->first_waiter = NULL; + return false; + } + + if (sem->first_waiter == waiter) { + sem->first_waiter = list_first_entry(&waiter->list, + struct rwsem_waiter, list); + } + list_del(&waiter->list); + + return true; } /* @@ -385,14 +394,24 @@ static inline bool rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { lockdep_assert_held(&sem->wait_lock); - list_del(&waiter->list); - if (likely(!list_empty(&sem->wait_list))) + if (__rwsem_del_waiter(sem, waiter)) return true; - atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); return false; } +static inline +struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem, + const struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) +{ + struct rwsem_waiter *next = 
list_first_entry(&waiter->list, + struct rwsem_waiter, list); + if (next == sem->first_waiter) + return NULL; + return next; +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -411,7 +430,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { - struct rwsem_waiter *waiter, *tmp; + struct rwsem_waiter *waiter, *next; long oldcount, woken = 0, adjustment = 0; struct list_head wlist; @@ -421,7 +440,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * Take a peek at the queue head waiter such that we can determine * the wakeup(s) to perform. */ - waiter = rwsem_first_waiter(sem); + waiter = sem->first_waiter; if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { @@ -506,25 +525,28 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * put them into wake_q to be woken up later. */ INIT_LIST_HEAD(&wlist); - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + do { + next = next_waiter(sem, waiter); if (waiter->type == RWSEM_WAITING_FOR_WRITE) continue; woken++; list_move_tail(&waiter->list, &wlist); + if (sem->first_waiter == waiter) + sem->first_waiter = next; /* * Limit # of readers that can be woken up per wakeup call. */ if (unlikely(woken >= MAX_READERS_WAKEUP)) break; - } + } while ((waiter = next) != NULL); adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); oldcount = atomic_long_read(&sem->count); - if (list_empty(&sem->wait_list)) { + if (!sem->first_waiter) { /* * Combined with list_move_tail() above, this implies * rwsem_del_waiter(). 
@@ -545,7 +567,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, atomic_long_add(adjustment, &sem->count); /* 2nd pass */ - list_for_each_entry_safe(waiter, tmp, &wlist, list) { + list_for_each_entry_safe(waiter, next, &wlist, list) { struct task_struct *tsk; tsk = waiter->task; @@ -577,7 +599,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, struct wake_q_head *wake_q) __releases(&sem->wait_lock) { - bool first = rwsem_first_waiter(sem) == waiter; + bool first = sem->first_waiter == waiter; wake_q_init(wake_q); @@ -602,8 +624,9 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, */ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { - struct rwsem_waiter *first = rwsem_first_waiter(sem); + struct rwsem_waiter *first = sem->first_waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -639,7 +662,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, new |= RWSEM_WRITER_LOCKED; new &= ~RWSEM_FLAG_HANDOFF; - if (list_is_singular(&sem->wait_list)) + if (list_empty(&first->list)) new &= ~RWSEM_FLAG_WAITERS; } } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); @@ -659,7 +682,8 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on * success. 
*/ - list_del(&waiter->list); + __rwsem_del_waiter(sem, waiter); + rwsem_set_owner(sem); return true; } @@ -994,7 +1018,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat { long adjustment = -RWSEM_READER_BIAS; long rcnt = (count >> RWSEM_READER_SHIFT); - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); /* @@ -1019,7 +1043,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat */ if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) { raw_spin_lock_irq(&sem->wait_lock); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); @@ -1035,7 +1059,8 @@ queue: waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { + first = sem->first_waiter; + if (!first) { /* * In case the wait queue is empty and the lock isn't owned * by a writer, this reader can exit the slowpath and return @@ -1051,8 +1076,11 @@ queue: return sem; } adjustment += RWSEM_FLAG_WAITERS; + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; + } else { + list_add_tail(&waiter.list, &first->list); } - rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1110,7 +1138,7 @@ out_nolock: static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1129,10 +1157,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - rwsem_add_waiter(sem, &waiter); - /* we're now waiting on the lock */ - if (rwsem_first_waiter(sem) != &waiter) { + first = sem->first_waiter; + if (first) { + 
list_add_tail(&waiter.list, &first->list); rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), &wake_q); if (!wake_q_empty(&wake_q)) { @@ -1145,6 +1173,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) raw_spin_lock_irq(&sem->wait_lock); } } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } @@ -1218,7 +1248,7 @@ static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -1239,7 +1269,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -1532,6 +1562,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) * lock for reading */ void __sched down_read(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1541,6 +1572,7 @@ void __sched down_read(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read); int __sched down_read_interruptible(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1555,6 +1587,7 @@ int __sched down_read_interruptible(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read_interruptible); int __sched down_read_killable(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1572,6 +1605,7 @@ EXPORT_SYMBOL(down_read_killable); * trylock for reading -- returns 1 if successful, 0 if contention */ int down_read_trylock(struct rw_semaphore *sem) + __no_context_analysis { int ret = 
__down_read_trylock(sem); @@ -1585,6 +1619,7 @@ EXPORT_SYMBOL(down_read_trylock); * lock for writing */ void __sched down_write(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -1596,6 +1631,7 @@ EXPORT_SYMBOL(down_write); * lock for writing */ int __sched down_write_killable(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -1614,6 +1650,7 @@ EXPORT_SYMBOL(down_write_killable); * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct rw_semaphore *sem) + __no_context_analysis { int ret = __down_write_trylock(sem); @@ -1628,6 +1665,7 @@ EXPORT_SYMBOL(down_write_trylock); * release a read lock */ void up_read(struct rw_semaphore *sem) + __no_context_analysis { rwsem_release(&sem->dep_map, _RET_IP_); __up_read(sem); @@ -1638,6 +1676,7 @@ EXPORT_SYMBOL(up_read); * release a write lock */ void up_write(struct rw_semaphore *sem) + __no_context_analysis { rwsem_release(&sem->dep_map, _RET_IP_); __up_write(sem); @@ -1648,6 +1687,7 @@ EXPORT_SYMBOL(up_write); * downgrade write lock to read lock */ void downgrade_write(struct rw_semaphore *sem) + __no_context_analysis { lock_downgrade(&sem->dep_map, _RET_IP_); __downgrade_write(sem); @@ -1657,6 +1697,7 @@ EXPORT_SYMBOL(downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC void down_read_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1665,6 +1706,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); int down_read_killable_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1679,6 +1721,7 @@ int down_read_killable_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_killable_nested); void 
_down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) + __no_context_analysis { might_sleep(); rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); @@ -1687,6 +1730,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) EXPORT_SYMBOL(_down_write_nest_lock); void down_read_non_owner(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); __down_read(sem); @@ -1701,6 +1745,7 @@ void down_read_non_owner(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read_non_owner); void down_write_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1709,6 +1754,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1724,6 +1770,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) + __no_context_analysis { DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 3ef032e22f7e..74d41433ba13 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -21,7 +21,7 @@ * too. * * The ->count variable represents how many more tasks can acquire this - * semaphore. If it's zero, there may be tasks waiting on the wait_list. + * semaphore. If it's zero, there may be waiters. 
*/ #include <linux/compiler.h> @@ -226,7 +226,7 @@ void __sched up(struct semaphore *sem) hung_task_sem_clear_if_holder(sem); - if (likely(list_empty(&sem->wait_list))) + if (likely(!sem->first_waiter)) sem->count++; else __up(sem, &wake_q); @@ -244,6 +244,21 @@ struct semaphore_waiter { bool up; }; +static inline +void sem_del_waiter(struct semaphore *sem, struct semaphore_waiter *waiter) +{ + if (list_empty(&waiter->list)) { + sem->first_waiter = NULL; + return; + } + + if (sem->first_waiter == waiter) { + sem->first_waiter = list_first_entry(&waiter->list, + struct semaphore_waiter, list); + } + list_del(&waiter->list); +} + /* * Because this function is inlined, the 'state' parameter will be * constant, and thus optimised away by the compiler. Likewise the @@ -252,9 +267,15 @@ struct semaphore_waiter { static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { - struct semaphore_waiter waiter; - - list_add_tail(&waiter.list, &sem->wait_list); + struct semaphore_waiter waiter, *first; + + first = sem->first_waiter; + if (first) { + list_add_tail(&waiter.list, &first->list); + } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; + } waiter.task = current; waiter.up = false; @@ -274,11 +295,11 @@ static inline int __sched ___down_common(struct semaphore *sem, long state, } timed_out: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -ETIME; interrupted: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -EINTR; } @@ -321,9 +342,9 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) static noinline void __sched __up(struct semaphore *sem, struct wake_q_head *wake_q) { - struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, - struct semaphore_waiter, list); - list_del(&waiter->list); + struct semaphore_waiter *waiter = sem->first_waiter; + + sem_del_waiter(sem, waiter); waiter->up = true; wake_q_add(wake_q, waiter->task); } diff --git 
a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 7685defd7c52..b42d293da38b 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -64,8 +64,9 @@ EXPORT_PER_CPU_SYMBOL(__mmiowb_state); * time (making _this_ CPU preemptible if possible), and we also signal * towards that other CPU that it should break the lock ASAP. */ -#define BUILD_LOCK_OPS(op, locktype) \ +#define BUILD_LOCK_OPS(op, locktype, lock_ctx_op) \ static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ + lock_ctx_op(lock) \ { \ for (;;) { \ preempt_disable(); \ @@ -78,6 +79,7 @@ static void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ } \ \ static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ + lock_ctx_op(lock) \ { \ unsigned long flags; \ \ @@ -96,11 +98,13 @@ static unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ } \ \ static void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ + lock_ctx_op(lock) \ { \ _raw_##op##_lock_irqsave(lock); \ } \ \ static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ + lock_ctx_op(lock) \ { \ unsigned long flags; \ \ @@ -123,11 +127,11 @@ static void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_irqsave() * __[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(spin, raw_spinlock, __acquires); #ifndef CONFIG_PREEMPT_RT -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); +BUILD_LOCK_OPS(read, rwlock, __acquires_shared); +BUILD_LOCK_OPS(write, rwlock, __acquires); #endif #endif diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 31a785afee6c..016f0db892a5 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -4,24 +4,21 @@ #define MUTEX mutex #define MUTEX_WAITER mutex_waiter +#define WAIT_LOCK wait_lock static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) + __must_hold(&lock->wait_lock) { - struct mutex_waiter *w; - - w = 
list_first_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; - - return w; + return lock->first_waiter; } static inline struct mutex_waiter * __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) + __must_hold(&lock->wait_lock) { w = list_next_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + if (lock->first_waiter == w) return NULL; return w; @@ -29,9 +26,10 @@ __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) static inline struct mutex_waiter * __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) + __must_hold(&lock->wait_lock) { w = list_prev_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + if (lock->first_waiter == w) return NULL; return w; @@ -39,23 +37,20 @@ __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) static inline struct mutex_waiter * __ww_waiter_last(struct mutex *lock) + __must_hold(&lock->wait_lock) { - struct mutex_waiter *w; - - w = list_last_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; + struct mutex_waiter *w = lock->first_waiter; + if (w) + w = list_prev_entry(w, list); return w; } static inline void __ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) + __must_hold(&lock->wait_lock) { - struct list_head *p = &lock->wait_list; - if (pos) - p = &pos->list; - __mutex_add_waiter(lock, waiter, p); + __mutex_add_waiter(lock, waiter, pos); } static inline struct task_struct * @@ -71,16 +66,19 @@ __ww_mutex_has_waiters(struct mutex *lock) } static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags) + __acquires(&lock->wait_lock) { raw_spin_lock_irqsave(&lock->wait_lock, *flags); } static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags) + __releases(&lock->wait_lock) { raw_spin_unlock_irqrestore(&lock->wait_lock, *flags); } static inline void 
lockdep_assert_wait_lock_held(struct mutex *lock) + __must_hold(&lock->wait_lock) { lockdep_assert_held(&lock->wait_lock); } @@ -89,9 +87,11 @@ static inline void lockdep_assert_wait_lock_held(struct mutex *lock) #define MUTEX rt_mutex #define MUTEX_WAITER rt_mutex_waiter +#define WAIT_LOCK rtmutex.wait_lock static inline struct rt_mutex_waiter * __ww_waiter_first(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); if (!n) @@ -119,6 +119,7 @@ __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) static inline struct rt_mutex_waiter * __ww_waiter_last(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); if (!n) @@ -140,21 +141,25 @@ __ww_mutex_owner(struct rt_mutex *lock) static inline bool __ww_mutex_has_waiters(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { return rt_mutex_has_waiters(&lock->rtmutex); } static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags) + __acquires(&lock->rtmutex.wait_lock) { raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags); } static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags) + __releases(&lock->rtmutex.wait_lock) { raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { lockdep_assert_held(&lock->rtmutex.wait_lock); } @@ -285,11 +290,11 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, debug_mutex_wake_waiter(lock, waiter); #endif /* - * When waking up the task to die, be sure to clear the - * blocked_on pointer. Otherwise we can see circular - * blocked_on relationships that can't resolve. + * When waking up the task to die, be sure to set the + * blocked_on to PROXY_WAKING. Otherwise we can see + * circular blocked_on relationships that can't resolve. 
*/ - __clear_task_blocked_on(waiter->task, lock); + set_task_blocked_on_waking(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -307,6 +312,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct ww_acquire_ctx *hold_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->WAIT_LOCK) { struct task_struct *owner = __ww_mutex_owner(lock); @@ -339,15 +345,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock, */ if (owner != current) { /* - * When waking up the task to wound, be sure to clear the - * blocked_on pointer. Otherwise we can see circular - * blocked_on relationships that can't resolve. + * When waking up the task to wound, be sure to set the + * blocked_on to PROXY_WAKING. Otherwise we can see + * circular blocked_on relationships that can't resolve. * * NOTE: We pass NULL here instead of lock, because we * are waking the mutex owner, who may be currently * blocked on a different mutex. */ - __clear_task_blocked_on(owner, NULL); + set_task_blocked_on_waking(owner, NULL); wake_q_add(wake_q, owner); } return true; @@ -371,6 +377,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, static void __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->WAIT_LOCK) { struct MUTEX_WAITER *cur; @@ -397,6 +404,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { DEFINE_WAKE_Q(wake_q); unsigned long flags; + bool has_waiters; ww_mutex_lock_acquired(lock, ctx); @@ -418,7 +426,8 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx * and/or !empty list. 
*/ - if (likely(!__ww_mutex_has_waiters(&lock->base))) + has_waiters = data_race(__ww_mutex_has_waiters(&lock->base)); + if (likely(!has_waiters)) return; /* @@ -464,6 +473,7 @@ __ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) static inline int __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, struct ww_acquire_ctx *ctx) + __must_hold(&lock->WAIT_LOCK) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); @@ -514,6 +524,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->WAIT_LOCK) { struct MUTEX_WAITER *cur, *pos = NULL; bool is_wait_die; diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index c7196de838ed..e07fb3b96bc3 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -90,6 +90,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) EXPORT_SYMBOL(ww_mutex_lock_interruptible); void __sched ww_mutex_unlock(struct ww_mutex *lock) + __no_context_analysis { struct rt_mutex *rtm = &lock->base; diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index be74917802ad..43b1bb01fd27 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -169,9 +169,10 @@ config MODVERSIONS make them incompatible with the kernel you are running. If unsure, say N. +if MODVERSIONS + choice prompt "Module versioning implementation" - depends on MODVERSIONS help Select the tool used to calculate symbol versions for modules. @@ -206,7 +207,7 @@ endchoice config ASM_MODVERSIONS bool - default HAVE_ASM_MODVERSIONS && MODVERSIONS + default HAVE_ASM_MODVERSIONS help This enables module versioning for exported symbols also from assembly. 
This can be enabled only when the target architecture @@ -214,7 +215,6 @@ config ASM_MODVERSIONS config EXTENDED_MODVERSIONS bool "Extended Module Versioning Support" - depends on MODVERSIONS help This enables extended MODVERSIONs support, allowing long symbol names to be versioned. @@ -224,7 +224,6 @@ config EXTENDED_MODVERSIONS config BASIC_MODVERSIONS bool "Basic Module Versioning Support" - depends on MODVERSIONS default y help This enables basic MODVERSIONS support, allowing older tools or @@ -237,6 +236,8 @@ config BASIC_MODVERSIONS This is enabled by default when MODVERSIONS are enabled. If unsure, say Y. +endif # MODVERSIONS + config MODULE_SRCVERSION_ALL bool "Source checksum for all modules" help @@ -277,10 +278,11 @@ config MODULE_SIG_FORCE Reject unsigned modules or signed modules for which we don't have a key. Without this, such modules will simply taint the kernel. +if MODULE_SIG || IMA_APPRAISE_MODSIG + config MODULE_SIG_ALL bool "Automatically sign all modules" default y - depends on MODULE_SIG || IMA_APPRAISE_MODSIG help Sign all modules during make modules_install. Without this option, modules must be signed manually, using the scripts/sign-file tool. @@ -290,7 +292,6 @@ comment "Do not forget to sign required modules with scripts/sign-file" choice prompt "Hash algorithm to sign modules" - depends on MODULE_SIG || IMA_APPRAISE_MODSIG default MODULE_SIG_SHA512 help This determines which sort of hashing algorithm will be used during @@ -327,7 +328,6 @@ endchoice config MODULE_SIG_HASH string - depends on MODULE_SIG || IMA_APPRAISE_MODSIG default "sha256" if MODULE_SIG_SHA256 default "sha384" if MODULE_SIG_SHA384 default "sha512" if MODULE_SIG_SHA512 @@ -335,6 +335,8 @@ config MODULE_SIG_HASH default "sha3-384" if MODULE_SIG_SHA3_384 default "sha3-512" if MODULE_SIG_SHA3_512 +endif # MODULE_SIG || IMA_APPRAISE_MODSIG + config MODULE_COMPRESS bool "Module compression" help @@ -350,9 +352,10 @@ config MODULE_COMPRESS If unsure, say N. 
+if MODULE_COMPRESS + choice prompt "Module compression type" - depends on MODULE_COMPRESS help Choose the supported algorithm for module compression. @@ -379,7 +382,6 @@ endchoice config MODULE_COMPRESS_ALL bool "Automatically compress all modules" default y - depends on MODULE_COMPRESS help Compress all modules during 'make modules_install'. @@ -389,7 +391,6 @@ config MODULE_COMPRESS_ALL config MODULE_DECOMPRESS bool "Support in-kernel module decompression" - depends on MODULE_COMPRESS select ZLIB_INFLATE if MODULE_COMPRESS_GZIP select XZ_DEC if MODULE_COMPRESS_XZ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD @@ -400,6 +401,8 @@ config MODULE_DECOMPRESS If unsure, say N. +endif # MODULE_COMPRESS + config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS bool "Allow loading of modules with missing namespace imports" help diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 618202578b42..061161cc79d9 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -53,10 +53,8 @@ extern const size_t modinfo_attrs_count; /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; -extern const struct kernel_symbol __start___ksymtab_gpl[]; -extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const u32 __start___kcrctab[]; -extern const u32 __start___kcrctab_gpl[]; +extern const u8 __start___kflagstab[]; #define KMOD_PATH_LEN 256 extern char modprobe_path[]; diff --git a/kernel/module/main.c b/kernel/module/main.c index 2bac4c7cd019..46dd8d25a605 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -11,6 +11,7 @@ #include <linux/extable.h> #include <linux/moduleloader.h> #include <linux/module_signature.h> +#include <linux/module_symbol.h> #include <linux/trace_events.h> #include <linux/init.h> #include <linux/kallsyms.h> @@ -87,7 +88,7 @@ struct mod_tree_root mod_tree __cacheline_aligned = { struct symsearch { const struct kernel_symbol *start, *stop; 
const u32 *crcs; - enum mod_license license; + const u8 *flagstab; }; /* @@ -364,19 +365,21 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, struct find_symbol_arg *fsa) { struct kernel_symbol *sym; - - if (!fsa->gplok && syms->license == GPL_ONLY) - return false; + u8 sym_flags; sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, sizeof(struct kernel_symbol), cmp_name); if (!sym) return false; + sym_flags = *(syms->flagstab + (sym - syms->start)); + if (!fsa->gplok && (sym_flags & KSYM_FLAG_GPL_ONLY)) + return false; + fsa->owner = owner; fsa->crc = symversion(syms->crcs, sym - syms->start); fsa->sym = sym; - fsa->license = syms->license; + fsa->license = (sym_flags & KSYM_FLAG_GPL_ONLY) ? GPL_ONLY : NOT_GPL_ONLY; return true; } @@ -387,36 +390,31 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, */ bool find_symbol(struct find_symbol_arg *fsa) { - static const struct symsearch arr[] = { - { __start___ksymtab, __stop___ksymtab, __start___kcrctab, - NOT_GPL_ONLY }, - { __start___ksymtab_gpl, __stop___ksymtab_gpl, - __start___kcrctab_gpl, - GPL_ONLY }, + const struct symsearch syms = { + .start = __start___ksymtab, + .stop = __stop___ksymtab, + .crcs = __start___kcrctab, + .flagstab = __start___kflagstab, }; struct module *mod; - unsigned int i; - for (i = 0; i < ARRAY_SIZE(arr); i++) - if (find_exported_symbol_in_section(&arr[i], NULL, fsa)) - return true; + if (find_exported_symbol_in_section(&syms, NULL, fsa)) + return true; list_for_each_entry_rcu(mod, &modules, list, lockdep_is_held(&module_mutex)) { - struct symsearch arr[] = { - { mod->syms, mod->syms + mod->num_syms, mod->crcs, - NOT_GPL_ONLY }, - { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, - mod->gpl_crcs, - GPL_ONLY }, + const struct symsearch syms = { + .start = mod->syms, + .stop = mod->syms + mod->num_syms, + .crcs = mod->crcs, + .flagstab = mod->flagstab, }; if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < 
ARRAY_SIZE(arr); i++) - if (find_exported_symbol_in_section(&arr[i], mod, fsa)) - return true; + if (find_exported_symbol_in_section(&syms, mod, fsa)) + return true; } pr_debug("Failed to find symbol %s\n", fsa->name); @@ -607,6 +605,36 @@ static const struct module_attribute modinfo_##field = { \ MODINFO_ATTR(version); MODINFO_ATTR(srcversion); +static void setup_modinfo_import_ns(struct module *mod, const char *s) +{ + mod->imported_namespaces = NULL; +} + +static ssize_t show_modinfo_import_ns(const struct module_attribute *mattr, + struct module_kobject *mk, char *buffer) +{ + return sysfs_emit(buffer, "%s\n", mk->mod->imported_namespaces); +} + +static int modinfo_import_ns_exists(struct module *mod) +{ + return mod->imported_namespaces != NULL; +} + +static void free_modinfo_import_ns(struct module *mod) +{ + kfree(mod->imported_namespaces); + mod->imported_namespaces = NULL; +} + +static const struct module_attribute modinfo_import_ns = { + .attr = { .name = "import_ns", .mode = 0444 }, + .show = show_modinfo_import_ns, + .setup = setup_modinfo_import_ns, + .test = modinfo_import_ns_exists, + .free = free_modinfo_import_ns, +}; + static struct { char name[MODULE_NAME_LEN]; char taints[MODULE_FLAGS_BUF_SIZE]; @@ -1058,6 +1086,7 @@ const struct module_attribute *const modinfo_attrs[] = { &module_uevent, &modinfo_version, &modinfo_srcversion, + &modinfo_import_ns, &modinfo_initstate, &modinfo_coresize, #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC @@ -1408,7 +1437,7 @@ static void free_module(struct module *mod) module_unload_free(mod); /* Free any allocated parameters. 
*/ - destroy_params(mod->kp, mod->num_kp); + module_destroy_params(mod->kp, mod->num_kp); if (is_livepatch_module(mod)) free_module_elf(mod); @@ -1466,29 +1495,17 @@ EXPORT_SYMBOL_GPL(__symbol_get); */ static int verify_exported_symbols(struct module *mod) { - unsigned int i; const struct kernel_symbol *s; - struct { - const struct kernel_symbol *sym; - unsigned int num; - } arr[] = { - { mod->syms, mod->num_syms }, - { mod->gpl_syms, mod->num_gpl_syms }, - }; - - for (i = 0; i < ARRAY_SIZE(arr); i++) { - for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - struct find_symbol_arg fsa = { - .name = kernel_symbol_name(s), - .gplok = true, - }; - if (find_symbol(&fsa)) { - pr_err("%s: exports duplicate symbol %s" - " (owned by %s)\n", - mod->name, kernel_symbol_name(s), - module_name(fsa.owner)); - return -ENOEXEC; - } + for (s = mod->syms; s < mod->syms + mod->num_syms; s++) { + struct find_symbol_arg fsa = { + .name = kernel_symbol_name(s), + .gplok = true, + }; + if (find_symbol(&fsa)) { + pr_err("%s: exports duplicate symbol %s (owned by %s)\n", + mod->name, kernel_symbol_name(s), + module_name(fsa.owner)); + return -ENOEXEC; } } return 0; @@ -1568,6 +1585,13 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) break; default: + if (sym[i].st_shndx >= info->hdr->e_shnum) { + pr_err("%s: Symbol %s has an invalid section index %u (max %u)\n", + mod->name, name, sym[i].st_shndx, info->hdr->e_shnum - 1); + ret = -ENOEXEC; + break; + } + /* Divert to percpu allocation if a percpu var. 
*/ if (sym[i].st_shndx == info->index.pcpu) secbase = (unsigned long)mod_percpu(mod); @@ -1753,11 +1777,43 @@ static void module_license_taint_check(struct module *mod, const char *license) } } +static int copy_modinfo_import_ns(struct module *mod, struct load_info *info) +{ + char *ns; + size_t len, total_len = 0; + char *buf, *p; + + for_each_modinfo_entry(ns, info, "import_ns") + total_len += strlen(ns) + 1; + + if (!total_len) { + mod->imported_namespaces = NULL; + return 0; + } + + buf = kmalloc(total_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + p = buf; + for_each_modinfo_entry(ns, info, "import_ns") { + len = strlen(ns); + memcpy(p, ns, len); + p += len; + *p++ = '\n'; + } + /* Replace trailing newline with null terminator. */ + *(p - 1) = '\0'; + + mod->imported_namespaces = buf; + return 0; +} + static int setup_modinfo(struct module *mod, struct load_info *info) { const struct module_attribute *attr; char *imported_namespace; - int i; + int i, err; for (i = 0; (attr = modinfo_attrs[i]); i++) { if (attr->setup) @@ -1776,6 +1832,10 @@ static int setup_modinfo(struct module *mod, struct load_info *info) } } + err = copy_modinfo_import_ns(mod, info); + if (err) + return err; + return 0; } @@ -2603,10 +2663,14 @@ static int find_module_sections(struct module *mod, struct load_info *info) mod->syms = section_objs(info, "__ksymtab", sizeof(*mod->syms), &mod->num_syms); mod->crcs = section_addr(info, "__kcrctab"); - mod->gpl_syms = section_objs(info, "__ksymtab_gpl", - sizeof(*mod->gpl_syms), - &mod->num_gpl_syms); - mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); + mod->flagstab = section_addr(info, "__kflagstab"); + + if (section_addr(info, "__ksymtab_gpl")) + pr_warn("%s: ignoring obsolete section __ksymtab_gpl\n", + mod->name); + if (section_addr(info, "__kcrctab_gpl")) + pr_warn("%s: ignoring obsolete section __kcrctab_gpl\n", + mod->name); #ifdef CONFIG_CONSTRUCTORS mod->ctors = section_objs(info, ".ctors", @@ -2810,11 +2874,14 @@ out_err: 
return ret; } -static int check_export_symbol_versions(struct module *mod) +static int check_export_symbol_sections(struct module *mod) { + if (mod->num_syms && !mod->flagstab) { + pr_err("%s: no flags for exported symbols\n", mod->name); + return -ENOEXEC; + } #ifdef CONFIG_MODVERSIONS - if ((mod->num_syms && !mod->crcs) || - (mod->num_gpl_syms && !mod->gpl_crcs)) { + if (mod->num_syms && !mod->crcs) { return try_to_force_load(mod, "no versions for exported symbols"); } @@ -3038,15 +3105,19 @@ static noinline int do_init_module(struct module *mod) if (mod->init != NULL) ret = do_one_initcall(mod->init); if (ret < 0) { + /* + * -EEXIST is reserved by [f]init_module() to signal to userspace that + * a module with this name is already loaded. Use something else if the + * module itself is returning that. + */ + if (ret == -EEXIST) + ret = -EBUSY; + goto fail_free_freeinit; } - if (ret > 0) { - pr_warn("%s: '%s'->init suspiciously returned %d, it should " - "follow 0/-E convention\n" - "%s: loading module anyway...\n", - __func__, mod->name, ret, __func__); - dump_stack(); - } + if (ret > 0) + pr_warn("%s: init suspiciously returned %d, it should follow 0/-E convention\n", + mod->name, ret); /* Now it's a first class citizen! 
*/ mod->state = MODULE_STATE_LIVE; @@ -3427,7 +3498,7 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto free_unload; - err = check_export_symbol_versions(mod); + err = check_export_symbol_sections(mod); if (err) goto free_unload; @@ -3512,7 +3583,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_sysfs_teardown(mod); coming_cleanup: mod->state = MODULE_STATE_GOING; - destroy_params(mod->kp, mod->num_kp); + module_destroy_params(mod->kp, mod->num_kp); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); @@ -3544,12 +3615,6 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_unlock(&module_mutex); free_module: mod_stat_bump_invalid(info, flags); - /* Free lock-classes; relies on the preceding sync_rcu() */ - for_class_mod_mem_type(type, core_data) { - lockdep_free_key_range(mod->mem[type].base, - mod->mem[type].size); - } - module_memory_restore_rox(mod); module_deallocate(mod, info); free_copy: diff --git a/kernel/module/signing.c b/kernel/module/signing.c index a2ff4242e623..590ba29c85ab 100644 --- a/kernel/module/signing.c +++ b/kernel/module/signing.c @@ -70,7 +70,7 @@ int mod_verify_sig(const void *mod, struct load_info *info) int module_sig_check(struct load_info *info, int flags) { int err = -ENODATA; - const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const unsigned long markerlen = sizeof(MODULE_SIGNATURE_MARKER) - 1; const char *reason; const void *mod = info->hdr; bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | @@ -81,7 +81,7 @@ int module_sig_check(struct load_info *info, int flags) */ if (!mangled_module && info->len > markerlen && - memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { + memcmp(mod + info->len - markerlen, MODULE_SIGNATURE_MARKER, markerlen) == 0) { /* We truncate the module to discard the signature */ info->len -= markerlen; err = 
mod_verify_sig(mod, info); diff --git a/kernel/module_signature.c b/kernel/module_signature.c index 00132d12487c..a0eee2fe4368 100644 --- a/kernel/module_signature.c +++ b/kernel/module_signature.c @@ -24,7 +24,7 @@ int mod_check_sig(const struct module_signature *ms, size_t file_len, if (be32_to_cpu(ms->sig_len) >= file_len - sizeof(*ms)) return -EBADMSG; - if (ms->id_type != PKEY_ID_PKCS7) { + if (ms->id_type != MODULE_SIGNATURE_TYPE_PKCS7) { pr_err("%s: not signed with expected PKCS#7 message\n", name); return -ENOPKG; diff --git a/kernel/nscommon.c b/kernel/nscommon.c index bdc3c86231d3..3166c1fd844a 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -309,3 +309,9 @@ void __ns_ref_active_get(struct ns_common *ns) return; } } + +bool may_see_all_namespaces(void) +{ + return (task_active_pid_ns(current) == &init_pid_ns) && + ns_capable_noaudit(init_pid_ns.user_ns, CAP_SYS_ADMIN); +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 259c4b4f1eeb..d9d3d5973bf5 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/nsproxy.h> +#include <linux/ns/ns_common_types.h> #include <linux/init_task.h> #include <linux/mnt_namespace.h> #include <linux/utsname.h> @@ -95,7 +96,8 @@ static struct nsproxy *create_new_namespaces(u64 flags, if (!new_nsp) return ERR_PTR(-ENOMEM); - new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs); + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, + user_ns, new_fs); if (IS_ERR(new_nsp->mnt_ns)) { err = PTR_ERR(new_nsp->mnt_ns); goto out_ns; @@ -170,9 +172,7 @@ int copy_namespaces(u64 flags, struct task_struct *tsk) struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; - if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET | - CLONE_NEWCGROUP | CLONE_NEWTIME)))) { + if (likely(!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER)))) { if ((flags & CLONE_VM) || 
likely(old_ns->time_ns_for_children == old_ns->time_ns)) { get_nsproxy(old_ns); @@ -212,18 +212,26 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { struct user_namespace *user_ns; + u64 flags = unshare_flags; int err = 0; - if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP | - CLONE_NEWTIME))) + if (!(flags & (CLONE_NS_ALL & ~CLONE_NEWUSER))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, + /* + * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases + * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS. + */ + if (flags & UNSHARE_EMPTY_MNTNS) { + flags &= ~(u64)UNSHARE_EMPTY_MNTNS; + flags |= CLONE_EMPTY_MNTNS; + } + + *new_nsp = create_new_namespaces(flags, current, user_ns, new_fs ? 
new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); @@ -292,9 +300,7 @@ int exec_task_namespaces(void) static int check_setns_flags(unsigned long flags) { - if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER | - CLONE_NEWPID | CLONE_NEWCGROUP))) + if (!flags || (flags & ~CLONE_NS_ALL)) return -EINVAL; #ifndef CONFIG_USER_NS diff --git a/kernel/nstree.c b/kernel/nstree.c index f36c59e6951d..6d12e5900ac0 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -515,32 +515,11 @@ static inline bool __must_check ns_requested(const struct klistns *kls, static inline bool __must_check may_list_ns(const struct klistns *kls, struct ns_common *ns) { - if (kls->user_ns) { - if (kls->userns_capable) - return true; - } else { - struct ns_common *owner; - struct user_namespace *user_ns; - - owner = ns_owner(ns); - if (owner) - user_ns = to_user_ns(owner); - else - user_ns = &init_user_ns; - if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) - return true; - } - - if (is_current_namespace(ns)) + if (kls->user_ns && kls->userns_capable) return true; - - if (ns->ns_type != CLONE_NEWUSER) - return false; - - if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) + if (is_current_namespace(ns)) return true; - - return false; + return may_see_all_namespaces(); } static inline void ns_put(struct ns_common *ns) @@ -600,7 +579,7 @@ static ssize_t do_listns_userns(struct klistns *kls) ret = 0; head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; - kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + kls->userns_capable = may_see_all_namespaces(); rcu_read_lock(); diff --git a/kernel/padata.c b/kernel/padata.c index 8657e6e0c224..0d3ea1b68b1f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -535,7 +535,8 @@ static void padata_init_reorder_list(struct parallel_data *pd) } /* Allocate and initialize the internal cpumask dependend resources. 
*/ -static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) +static struct parallel_data *padata_alloc_pd(struct padata_shell *ps, + int offlining_cpu) { struct padata_instance *pinst = ps->pinst; struct parallel_data *pd; @@ -561,6 +562,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) cpumask_and(pd->cpumask.pcpu, pinst->cpumask.pcpu, cpu_online_mask); cpumask_and(pd->cpumask.cbcpu, pinst->cpumask.cbcpu, cpu_online_mask); + if (offlining_cpu >= 0) { + __cpumask_clear_cpu(offlining_cpu, pd->cpumask.pcpu); + __cpumask_clear_cpu(offlining_cpu, pd->cpumask.cbcpu); + } padata_init_reorder_list(pd); padata_init_squeues(pd); @@ -607,11 +612,11 @@ static void __padata_stop(struct padata_instance *pinst) } /* Replace the internal control structure with a new one. */ -static int padata_replace_one(struct padata_shell *ps) +static int padata_replace_one(struct padata_shell *ps, int offlining_cpu) { struct parallel_data *pd_new; - pd_new = padata_alloc_pd(ps); + pd_new = padata_alloc_pd(ps, offlining_cpu); if (!pd_new) return -ENOMEM; @@ -621,7 +626,7 @@ static int padata_replace_one(struct padata_shell *ps) return 0; } -static int padata_replace(struct padata_instance *pinst) +static int padata_replace(struct padata_instance *pinst, int offlining_cpu) { struct padata_shell *ps; int err = 0; @@ -629,7 +634,7 @@ static int padata_replace(struct padata_instance *pinst) pinst->flags |= PADATA_RESET; list_for_each_entry(ps, &pinst->pslist, list) { - err = padata_replace_one(ps); + err = padata_replace_one(ps, offlining_cpu); if (err) break; } @@ -646,9 +651,21 @@ static int padata_replace(struct padata_instance *pinst) /* If cpumask contains no active cpu, we mark the instance as invalid. 
*/ static bool padata_validate_cpumask(struct padata_instance *pinst, - const struct cpumask *cpumask) + const struct cpumask *cpumask, + int offlining_cpu) { - if (!cpumask_intersects(cpumask, cpu_online_mask)) { + cpumask_copy(pinst->validate_cpumask, cpu_online_mask); + + /* + * @offlining_cpu is still in cpu_online_mask, so remove it here for + * validation. Using a sub-CPUHP_TEARDOWN_CPU hotplug state where + * @offlining_cpu wouldn't be in the online mask doesn't work because + * padata_cpu_offline() can fail but such a state doesn't allow failure. + */ + if (offlining_cpu >= 0) + __cpumask_clear_cpu(offlining_cpu, pinst->validate_cpumask); + + if (!cpumask_intersects(cpumask, pinst->validate_cpumask)) { pinst->flags |= PADATA_INVALID; return false; } @@ -664,13 +681,13 @@ static int __padata_set_cpumasks(struct padata_instance *pinst, int valid; int err; - valid = padata_validate_cpumask(pinst, pcpumask); + valid = padata_validate_cpumask(pinst, pcpumask, -1); if (!valid) { __padata_stop(pinst); goto out_replace; } - valid = padata_validate_cpumask(pinst, cbcpumask); + valid = padata_validate_cpumask(pinst, cbcpumask, -1); if (!valid) __padata_stop(pinst); @@ -678,7 +695,7 @@ out_replace: cpumask_copy(pinst->cpumask.pcpu, pcpumask); cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); - err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst); + err = padata_setup_cpumasks(pinst) ?: padata_replace(pinst, -1); if (valid) __padata_start(pinst); @@ -730,36 +747,6 @@ EXPORT_SYMBOL(padata_set_cpumask); #ifdef CONFIG_HOTPLUG_CPU -static int __padata_add_cpu(struct padata_instance *pinst, int cpu) -{ - int err = 0; - - if (cpumask_test_cpu(cpu, cpu_online_mask)) { - err = padata_replace(pinst); - - if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && - padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) - __padata_start(pinst); - } - - return err; -} - -static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) -{ - int err = 0; - - if 
(!cpumask_test_cpu(cpu, cpu_online_mask)) { - if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || - !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) - __padata_stop(pinst); - - err = padata_replace(pinst); - } - - return err; -} - static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) { return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || @@ -771,27 +758,39 @@ static int padata_cpu_online(unsigned int cpu, struct hlist_node *node) struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, cpu_online_node); + pinst = hlist_entry_safe(node, struct padata_instance, cpuhp_node); if (!pinst_has_cpu(pinst, cpu)) return 0; mutex_lock(&pinst->lock); - ret = __padata_add_cpu(pinst, cpu); + + ret = padata_replace(pinst, -1); + + if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu, -1) && + padata_validate_cpumask(pinst, pinst->cpumask.cbcpu, -1)) + __padata_start(pinst); + mutex_unlock(&pinst->lock); return ret; } -static int padata_cpu_dead(unsigned int cpu, struct hlist_node *node) +static int padata_cpu_offline(unsigned int cpu, struct hlist_node *node) { struct padata_instance *pinst; int ret; - pinst = hlist_entry_safe(node, struct padata_instance, cpu_dead_node); + pinst = hlist_entry_safe(node, struct padata_instance, cpuhp_node); if (!pinst_has_cpu(pinst, cpu)) return 0; mutex_lock(&pinst->lock); - ret = __padata_remove_cpu(pinst, cpu); + + if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu, cpu) || + !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu, cpu)) + __padata_stop(pinst); + + ret = padata_replace(pinst, cpu); + mutex_unlock(&pinst->lock); return ret; } @@ -802,15 +801,14 @@ static enum cpuhp_state hp_online; static void __padata_free(struct padata_instance *pinst) { #ifdef CONFIG_HOTPLUG_CPU - cpuhp_state_remove_instance_nocalls(CPUHP_PADATA_DEAD, - &pinst->cpu_dead_node); - cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpu_online_node); + 
cpuhp_state_remove_instance_nocalls(hp_online, &pinst->cpuhp_node); #endif WARN_ON(!list_empty(&pinst->pslist)); free_cpumask_var(pinst->cpumask.pcpu); free_cpumask_var(pinst->cpumask.cbcpu); + free_cpumask_var(pinst->validate_cpumask); destroy_workqueue(pinst->serial_wq); destroy_workqueue(pinst->parallel_wq); kfree(pinst); @@ -971,10 +969,10 @@ struct padata_instance *padata_alloc(const char *name) if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_serial_wq; - if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { - free_cpumask_var(pinst->cpumask.pcpu); - goto err_free_serial_wq; - } + if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) + goto err_free_p_mask; + if (!alloc_cpumask_var(&pinst->validate_cpumask, GFP_KERNEL)) + goto err_free_cb_mask; INIT_LIST_HEAD(&pinst->pslist); @@ -982,7 +980,7 @@ struct padata_instance *padata_alloc(const char *name) cpumask_copy(pinst->cpumask.cbcpu, cpu_possible_mask); if (padata_setup_cpumasks(pinst)) - goto err_free_masks; + goto err_free_v_mask; __padata_start(pinst); @@ -991,18 +989,19 @@ struct padata_instance *padata_alloc(const char *name) #ifdef CONFIG_HOTPLUG_CPU cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, - &pinst->cpu_online_node); - cpuhp_state_add_instance_nocalls_cpuslocked(CPUHP_PADATA_DEAD, - &pinst->cpu_dead_node); + &pinst->cpuhp_node); #endif cpus_read_unlock(); return pinst; -err_free_masks: - free_cpumask_var(pinst->cpumask.pcpu); +err_free_v_mask: + free_cpumask_var(pinst->validate_cpumask); +err_free_cb_mask: free_cpumask_var(pinst->cpumask.cbcpu); +err_free_p_mask: + free_cpumask_var(pinst->cpumask.pcpu); err_free_serial_wq: destroy_workqueue(pinst->serial_wq); err_put_cpus: @@ -1045,7 +1044,7 @@ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst) ps->pinst = pinst; cpus_read_lock(); - pd = padata_alloc_pd(ps); + pd = padata_alloc_pd(ps, -1); cpus_read_unlock(); if (!pd) @@ -1094,31 +1093,24 @@ void __init padata_init(void) int ret; ret 
= cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online", - padata_cpu_online, NULL); + padata_cpu_online, padata_cpu_offline); if (ret < 0) goto err; hp_online = ret; - - ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead", - NULL, padata_cpu_dead); - if (ret < 0) - goto remove_online_state; #endif possible_cpus = num_possible_cpus(); padata_works = kmalloc_objs(struct padata_work, possible_cpus); if (!padata_works) - goto remove_dead_state; + goto remove_online_state; for (i = 0; i < possible_cpus; ++i) list_add(&padata_works[i].pw_list, &padata_free_works); return; -remove_dead_state: -#ifdef CONFIG_HOTPLUG_CPU - cpuhp_remove_multi_state(CPUHP_PADATA_DEAD); remove_online_state: +#ifdef CONFIG_HOTPLUG_CPU cpuhp_remove_multi_state(hp_online); err: #endif diff --git a/kernel/panic.c b/kernel/panic.c index c78600212b6c..20feada5319d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -801,6 +801,8 @@ EXPORT_SYMBOL(panic); * Documentation/admin-guide/tainted-kernels.rst, including its * small shell script that prints the TAINT_FLAGS_COUNT bits of * /proc/sys/kernel/tainted. + * + * Also, update INIT_TAINT_BUF_MAX below. */ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { TAINT_FLAG(PROPRIETARY_MODULE, 'P', 'G'), @@ -854,15 +856,54 @@ static void print_tainted_seq(struct seq_buf *s, bool verbose) } } +/* The initial buffer can accommodate all taint flags in verbose + * mode, with some headroom. Once the allocator is available, the + * exact size is allocated dynamically; the initial buffer remains + * as a fallback if allocation fails. + * + * The verbose taint string currently requires up to 327 characters. 
+ */ +#define INIT_TAINT_BUF_MAX 350 + +static char init_taint_buf[INIT_TAINT_BUF_MAX] __initdata; +static char *taint_buf __refdata = init_taint_buf; +static size_t taint_buf_size = INIT_TAINT_BUF_MAX; + +static __init int alloc_taint_buf(void) +{ + int i; + char *buf; + size_t size = 0; + + size += sizeof("Tainted: ") - 1; + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + size += 2; /* For ", " */ + size += 4; /* For "[%c]=" */ + size += strlen(taint_flags[i].desc); + } + + size += 1; /* For NULL terminator */ + + buf = kmalloc(size, GFP_KERNEL); + + if (!buf) { + panic("Failed to allocate taint string buffer"); + } + + taint_buf = buf; + taint_buf_size = size; + + return 0; +} +postcore_initcall(alloc_taint_buf); + static const char *_print_tainted(bool verbose) { - /* FIXME: what should the size be? */ - static char buf[sizeof(taint_flags)]; struct seq_buf s; BUILD_BUG_ON(ARRAY_SIZE(taint_flags) != TAINT_FLAGS_COUNT); - seq_buf_init(&s, buf, sizeof(buf)); + seq_buf_init(&s, taint_buf, taint_buf_size); print_tainted_seq(&s, verbose); diff --git a/kernel/params.c b/kernel/params.c index 7188a12dbe86..74d620bc2521 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -161,7 +161,7 @@ static int parse_one(char *param, char *parse_args(const char *doing, char *args, const struct kernel_param *params, - unsigned num, + unsigned int num, s16 min_level, s16 max_level, void *arg, parse_unknown_fn unknown) @@ -745,15 +745,6 @@ void module_param_sysfs_remove(struct module *mod) } #endif -void destroy_params(const struct kernel_param *params, unsigned num) -{ - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].ops->free) - params[i].ops->free(params[i].arg); -} - struct module_kobject * __init_or_module lookup_or_create_module_kobject(const char *name) { @@ -985,3 +976,21 @@ static int __init param_sysfs_builtin_init(void) late_initcall(param_sysfs_builtin_init); #endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MODULES + +/* + * module_destroy_params - free all 
parameters for one module + * @params: module parameters (array) + * @num: number of module parameters + */ +void module_destroy_params(const struct kernel_param *params, unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].ops->free) + params[i].ops->free(params[i].arg); +} + +#endif /* CONFIG_MODULES */ diff --git a/kernel/pid.c b/kernel/pid.c index 3b96571d0fe6..fd5c2d4aa349 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -128,12 +128,11 @@ void free_pid(struct pid *pid) * is the reaper wake up the reaper. The reaper * may be sleeping in zap_pid_ns_processes(). */ - wake_up_process(ns->child_reaper); + wake_up_process(READ_ONCE(ns->child_reaper)); break; case PIDNS_ADDING: - /* Handle a fork failure of the first process */ - WARN_ON(ns->child_reaper); - ns->pid_allocated = 0; + /* Only possible if the 1st fork fails */ + WARN_ON(READ_ONCE(ns->child_reaper)); break; } @@ -215,12 +214,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, retval = -EINVAL; if (tid < 1 || tid >= pid_max[ns->level - i]) goto out_abort; - /* - * Also fail if a PID != 1 is requested and - * no PID 1 exists. - */ - if (tid != 1 && !tmp->child_reaper) - goto out_abort; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_abort; @@ -236,6 +229,10 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, retried_preload = false; idr_preload(GFP_KERNEL); spin_lock(&pidmap_lock); + /* For the case when the previous attempt to create init failed */ + if (ns->pid_allocated == PIDNS_ADDING) + idr_set_cursor(&ns->idr, 0); + for (tmp = ns, i = ns->level; i >= 0;) { int tid = set_tid[ns->level - i]; @@ -296,9 +293,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; - tmp = tmp->parent; i--; retried_preload = false; + + /* + * PID 1 (init) must be created first. 
+ */ + if (!READ_ONCE(tmp->child_reaper) && nr != 1) { + retval = -EINVAL; + goto out_free; + } + + tmp = tmp->parent; } /* @@ -311,6 +317,11 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, * * This can't be done earlier because we need to preserve other * error conditions. + * + * We need this even if copy_process() does the same check. If two + * or more tasks from parent namespace try to inject a child into a + * dead namespace, one of free_pid() calls from the copy_process() + * error path may try to wakeup the possibly freed ns->child_reaper. */ retval = -ENOMEM; if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) @@ -338,10 +349,6 @@ out_free: idr_remove(&upid->ns->idr, upid->nr); } - /* On failure to allocate the first pid, reset the state */ - if (ns->pid_allocated == PIDNS_ADDING) - idr_set_cursor(&ns->idr, 0); - spin_unlock(&pidmap_lock); idr_preload_end(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e48f5de41361..d36afc58ee1d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -369,15 +369,6 @@ static struct ns_common *pidns_for_children_get(struct task_struct *task) } task_unlock(task); - if (ns) { - read_lock(&tasklist_lock); - if (!ns->child_reaper) { - put_pid_ns(ns); - ns = NULL; - } - read_unlock(&tasklist_lock); - } - return ns ? 
&ns->ns : NULL; } diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c index 5a611d3950fd..4d4fd29bd2be 100644 --- a/kernel/power/em_netlink.c +++ b/kernel/power/em_netlink.c @@ -109,6 +109,8 @@ int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, id = nla_get_u32(info->attrs[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID]); pd = em_perf_domain_get_by_id(id); + if (!pd) + return -EINVAL; __em_nl_get_pd_size(pd, &msg_sz); msg = genlmsg_new(msg_sz, GFP_KERNEL); diff --git a/kernel/power/main.c b/kernel/power/main.c index 5f8c9e12eaec..5429e9f19b65 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -40,7 +40,7 @@ void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); - if (WARN_ON(!saved_gfp_count) || --saved_gfp_count) + if (!saved_gfp_count || --saved_gfp_count) return; gfp_allowed_mask = saved_gfp_mask; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6e1321837c66..a564650734dc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2855,6 +2855,17 @@ int snapshot_write_finalize(struct snapshot_handle *handle) { int error; + /* + * Call snapshot_write_next() to drain any trailing zero pages, + * but make sure we're in the data page region first. + * This function can return PAGE_SIZE if the kernel was expecting + * another copy page. Return -ENODATA in that situation. + */ + if (handle->cur > nr_meta_pages + 1) { + error = snapshot_write_next(handle); + if (error) + return error > 0 ? 
-ENODATA : error; + } copy_last_highmem_page(); error = hibernate_restore_protect_page(handle->buffer); /* Do that only if we have loaded the image entirely */ diff --git a/kernel/power/user.c b/kernel/power/user.c index 4401cfe26e5c..be77f3556bd7 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -322,11 +322,14 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = snapshot_write_finalize(&data->handle); if (error) break; - if (data->mode != O_WRONLY || !data->frozen || - !snapshot_image_loaded(&data->handle)) { + if (data->mode != O_WRONLY || !data->frozen) { error = -EPERM; break; } + if (!snapshot_image_loaded(&data->handle)) { + error = -ENODATA; + break; + } error = hibernation_restore(data->platform_support); break; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 392ec2f75f01..68c17daef8d4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -549,7 +549,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!dead && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) dead = do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { + else if (ignoring_children(tracer->sighand) || + p->signal->autoreap) { __wake_up_parent(p, tracer); dead = true; } diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 625d75392647..e078e988773d 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -228,4 +228,15 @@ config RCU_DYNTICKS_TORTURE This has no value for production and is only for testing. +config TRIVIAL_PREEMPT_RCU + bool "Textbook trivial preemptible RCU in rcutorture" + depends on RCU_EXPERT && RCU_TORTURE_TEST + default n + help + This option enables a textbook preemptible RCU that is + implemented in rcutorture. Its sole purpose is to validate + code used in books, papers, and presentations. + + This has no value for production and is only for testing. 
+ endmenu # "RCU Debugging" diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index dc5d614b372c..fa6d30ce73d1 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -502,6 +502,15 @@ do { \ ___locked; \ }) +#define raw_spin_trylock_irqsave_rcu_node(p, flags) \ +({ \ + bool ___locked = raw_spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + \ + if (___locked) \ + smp_mb__after_unlock_lock(); \ + ___locked; \ +}) + #define raw_lockdep_assert_held_rcu_node(p) \ lockdep_assert_held(&ACCESS_PRIVATE(p, lock)) @@ -682,4 +691,8 @@ int rcu_stall_notifier_call_chain(unsigned long val, void *v); static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; } #endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) +#ifdef CONFIG_TRIVIAL_PREEMPT_RCU +void synchronize_rcu_trivial_preempt(void); +#endif // #ifdef CONFIG_TRIVIAL_PREEMPT_RCU + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 4ac2b134a983..ac0b1c6b7dae 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -79,12 +79,6 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); * test-end checks, and the pair of calls through pointers. 
*/ -#ifdef MODULE -# define RCUSCALE_SHUTDOWN 0 -#else -# define RCUSCALE_SHUTDOWN 1 -#endif - torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); @@ -92,8 +86,8 @@ torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, minruntime, 0, "Minimum run time (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, - "Shutdown at end of scalability tests."); +torture_param(int, shutdown_secs, !IS_MODULE(CONFIG_RCU_SCALE_TEST) * 300, + "Shutdown at end of scalability tests or at specified timeout (s)."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable"); @@ -123,7 +117,6 @@ static int nrealreaders; static int nrealwriters; static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; -static struct task_struct *shutdown_task; static u64 **writer_durations; static bool *writer_done; @@ -132,7 +125,6 @@ static int *writer_n_durations; static atomic_t n_rcu_scale_reader_started; static atomic_t n_rcu_scale_writer_started; static atomic_t n_rcu_scale_writer_finished; -static wait_queue_head_t shutdown_wq; static u64 t_rcu_scale_writer_started; static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; @@ -519,6 +511,8 @@ static void rcu_scale_async_cb(struct rcu_head *rhp) rcu_scale_free(wmbp); } +static void rcu_scale_cleanup(void); + /* * RCU scale writer kthread. Repeatedly does a grace period. 
*/ @@ -622,9 +616,11 @@ rcu_scale_writer(void *arg) b_rcu_gp_test_finished = cur_ops->get_gp_seq(); } - if (shutdown) { + if (shutdown_secs) { + writer_tasks[me] = NULL; smp_mb(); /* Assign before wake. */ - wake_up(&shutdown_wq); + rcu_scale_cleanup(); + kernel_power_off(); } } } @@ -668,8 +664,8 @@ static void rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n", - scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown); + "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown_secs=%d\n", + scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown_secs); } /* @@ -722,6 +718,8 @@ static void kfree_call_rcu(struct rcu_head *rh) kfree(obj); } +static void kfree_scale_cleanup(void); + static int kfree_scale_thread(void *arg) { @@ -791,9 +789,11 @@ kfree_scale_thread(void *arg) rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), PAGES_TO_MB(mem_begin - mem_during)); - if (shutdown) { + if (shutdown_secs) { + kfree_reader_tasks[me] = NULL; smp_mb(); /* Assign before wake. */ - wake_up(&shutdown_wq); + kfree_scale_cleanup(); + kernel_power_off(); } } @@ -820,22 +820,6 @@ kfree_scale_cleanup(void) torture_cleanup_end(); } -/* - * shutdown kthread. Just waits to be awakened, then shuts down system. - */ -static int -kfree_scale_shutdown(void *arg) -{ - wait_event_idle(shutdown_wq, - atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); - - smp_mb(); /* Wake before output. 
*/ - - kfree_scale_cleanup(); - kernel_power_off(); - return -EINVAL; -} - // Used if doing RCU-kfree'ing via call_rcu(). static unsigned long jiffies_at_lazy_cb; static struct rcu_head lazy_test1_rh; @@ -895,13 +879,10 @@ kfree_scale_init(void) kfree_nrealthreads = compute_real(kfree_nthreads); /* Start up the kthreads. */ - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(kfree_scale_shutdown, NULL, - shutdown_task); + if (shutdown_secs) { + firsterr = torture_shutdown_init(shutdown_secs, kfree_scale_cleanup); if (torture_init_error(firsterr)) goto unwind; - schedule_timeout_uninterruptible(1); } pr_alert("kfree object size=%zu, kfree_by_call_rcu=%d\n", @@ -1058,20 +1039,6 @@ rcu_scale_cleanup(void) torture_cleanup_end(); } -/* - * RCU scalability shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_scale_shutdown(void *arg) -{ - wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); - smp_mb(); /* Wake before output. */ - rcu_scale_cleanup(); - kernel_power_off(); - return -EINVAL; -} - static int __init rcu_scale_init(void) { @@ -1121,13 +1088,10 @@ rcu_scale_init(void) /* Start up the kthreads. 
*/ - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(rcu_scale_shutdown, NULL, - shutdown_task); + if (shutdown_secs) { + firsterr = torture_shutdown_init(shutdown_secs, rcu_scale_cleanup); if (torture_init_error(firsterr)) goto unwind; - schedule_timeout_uninterruptible(1); } reader_tasks = kzalloc_objs(reader_tasks[0], nrealreaders); if (reader_tasks == NULL) { @@ -1201,7 +1165,7 @@ rcu_scale_init(void) unwind: torture_init_end(); rcu_scale_cleanup(); - if (shutdown) { + if (shutdown_secs) { WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); kernel_power_off(); } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8a9282a0245c..5f2848b828dc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -842,7 +842,14 @@ static unsigned long srcu_torture_completed(void) static void srcu_torture_deferred_free(struct rcu_torture *rp) { + unsigned long flags; + bool lockit = jiffies & 0x1; + + if (lockit) + raw_spin_lock_irqsave(&current->pi_lock, flags); call_srcu(srcu_ctlp, &rp->rtort_rcu, rcu_torture_cb); + if (lockit) + raw_spin_unlock_irqrestore(&current->pi_lock, flags); } static void srcu_torture_synchronize(void) @@ -1061,6 +1068,61 @@ static struct rcu_torture_ops trivial_ops = { .name = "trivial" }; +#ifdef CONFIG_TRIVIAL_PREEMPT_RCU + +/* + * Definitions for trivial CONFIG_PREEMPT=y torture testing. This + * implementation does not work well with large numbers of tasks or with + * long-term preemption. Either or both get you RCU CPU stall warnings. 
+ */ + +static void rcu_sync_torture_init_trivial_preempt(void) +{ + rcu_sync_torture_init(); + if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) { + onoff_interval = 0; + shuffle_interval = 0; + } +} + +static int rcu_torture_read_lock_trivial_preempt(void) +{ + struct task_struct *t = current; + + WRITE_ONCE(t->rcu_trivial_preempt_nesting, t->rcu_trivial_preempt_nesting + 1); + smp_mb(); + return 0; +} + +static void rcu_torture_read_unlock_trivial_preempt(int idx) +{ + struct task_struct *t = current; + + smp_store_release(&t->rcu_trivial_preempt_nesting, t->rcu_trivial_preempt_nesting - 1); +} + +static struct rcu_torture_ops trivial_preempt_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init_trivial_preempt, + .readlock = rcu_torture_read_lock_trivial_preempt, + .read_delay = rcu_read_delay, // just reuse rcu's version. + .readunlock = rcu_torture_read_unlock_trivial_preempt, + .readlock_held = torture_readlock_not_held, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial_preempt, + .exp_sync = synchronize_rcu_trivial_preempt, + .irq_capable = 0, // In theory it should be, but let's keep it trivial. 
+ .name = "trivial-preempt" +}; + +#define TRIVIAL_PREEMPT_OPS &trivial_preempt_ops, + +#else // #ifdef CONFIG_TRIVIAL_PREEMPT_RCU + +#define TRIVIAL_PREEMPT_OPS + +#endif // #else // #ifdef CONFIG_TRIVIAL_PREEMPT_RCU + #ifdef CONFIG_TASKS_RCU /* @@ -4449,7 +4511,7 @@ rcu_torture_init(void) static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS - &trivial_ops, + &trivial_ops, TRIVIAL_PREEMPT_OPS }; if (!torture_init_begin(torture_type, verbose)) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index c158b6a947cd..a2d9d75d88a1 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -92,15 +92,9 @@ torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); torture_param(int, nruns, 30, "Number of experiments to run."); // Reader delay in nanoseconds, 0 for no delay. torture_param(int, readdelay, 0, "Read-side delay in nanoseconds."); - -#ifdef MODULE -# define REFSCALE_SHUTDOWN 0 -#else -# define REFSCALE_SHUTDOWN 1 -#endif - -torture_param(bool, shutdown, REFSCALE_SHUTDOWN, - "Shutdown at end of scalability tests."); +// Maximum shutdown delay in seconds, or zero for no shutdown. +torture_param(int, shutdown_secs, !IS_MODULE(CONFIG_RCU_REF_SCALE_TEST) * 300, + "Shutdown at end of scalability tests or at specified timeout (s)."); struct reader_task { struct task_struct *task; @@ -109,12 +103,8 @@ struct reader_task { u64 last_duration_ns; }; -static struct task_struct *shutdown_task; -static wait_queue_head_t shutdown_wq; - static struct task_struct *main_task; static wait_queue_head_t main_wq; -static int shutdown_start; static struct reader_task *reader_tasks; @@ -1357,6 +1347,8 @@ static u64 process_durations(int n) return sum; } +static void ref_scale_cleanup(void); + // The main_func is the main orchestrator, it performs a bunch of // experiments. 
For every experiment, it orders all the readers // involved to start and waits for them to finish the experiment. It @@ -1443,9 +1435,10 @@ static int main_func(void *arg) oom_exit: // This will shutdown everything including us. - if (shutdown) { - shutdown_start = 1; - wake_up(&shutdown_wq); + if (shutdown_secs) { + main_task = NULL; // Avoid self-kill deadlock. + ref_scale_cleanup(); + kernel_power_off(); } // Wait for torture to stop us @@ -1463,8 +1456,8 @@ static void ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, - verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); + "--- %s: verbose=%d verbose_batched=%d shutdown_secs=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, verbose_batched, shutdown_secs, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } static void @@ -1497,19 +1490,6 @@ ref_scale_cleanup(void) torture_cleanup_end(); } -// Shutdown kthread. Just waits to be awakened, then shuts down system. -static int -ref_scale_shutdown(void *arg) -{ - wait_event_idle(shutdown_wq, shutdown_start); - - smp_mb(); // Wake before output. - ref_scale_cleanup(); - kernel_power_off(); - - return -EINVAL; -} - static int __init ref_scale_init(void) { @@ -1553,13 +1533,10 @@ ref_scale_init(void) ref_scale_print_module_parms(cur_ops, "Start of test"); // Shutdown task - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(ref_scale_shutdown, NULL, - shutdown_task); + if (shutdown_secs) { + firsterr = torture_shutdown_init(shutdown_secs, ref_scale_cleanup); if (torture_init_error(firsterr)) goto unwind; - schedule_timeout_uninterruptible(1); } // Reader tasks (default to ~75% of online CPUs). 
@@ -1604,7 +1581,7 @@ ref_scale_init(void) unwind: torture_init_end(); ref_scale_cleanup(); - if (shutdown) { + if (shutdown_secs) { WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); kernel_power_off(); } diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 3450c3751ef7..a2e2d516e51b 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -9,6 +9,7 @@ */ #include <linux/export.h> +#include <linux/irq_work.h> #include <linux/mutex.h> #include <linux/preempt.h> #include <linux/rcupdate_wait.h> @@ -41,6 +42,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp) ssp->srcu_idx_max = 0; INIT_WORK(&ssp->srcu_work, srcu_drive_gp); INIT_LIST_HEAD(&ssp->srcu_work.entry); + init_irq_work(&ssp->srcu_irq_work, srcu_tiny_irq_work); return 0; } @@ -84,6 +86,7 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); void cleanup_srcu_struct(struct srcu_struct *ssp) { WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); + irq_work_sync(&ssp->srcu_irq_work); flush_work(&ssp->srcu_work); WARN_ON(ssp->srcu_gp_running); WARN_ON(ssp->srcu_gp_waiting); @@ -177,6 +180,20 @@ void srcu_drive_gp(struct work_struct *wp) } EXPORT_SYMBOL_GPL(srcu_drive_gp); +/* + * Use an irq_work to defer schedule_work() to avoid acquiring the workqueue + * pool->lock while the caller might hold scheduler locks, causing lockdep + * splats due to workqueue_init() doing a wakeup. 
+ */ +void srcu_tiny_irq_work(struct irq_work *irq_work) +{ + struct srcu_struct *ssp; + + ssp = container_of(irq_work, struct srcu_struct, srcu_irq_work); + schedule_work(&ssp->srcu_work); +} +EXPORT_SYMBOL_GPL(srcu_tiny_irq_work); + static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; @@ -189,7 +206,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) - schedule_work(&ssp->srcu_work); + irq_work_queue(&ssp->srcu_irq_work); else if (list_empty(&ssp->srcu_work.entry)) list_add(&ssp->srcu_work.entry, &srcu_boot_list); } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index aef8e91ad33e..0d01cd8c4b4a 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/preempt.h> +#include <linux/irq_work.h> #include <linux/rcupdate_wait.h> #include <linux/sched.h> #include <linux/smp.h> @@ -75,44 +76,9 @@ static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); +static void srcu_irq_work(struct irq_work *work); static void srcu_delay_timer(struct timer_list *t); -/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). 
*/ -#define spin_lock_rcu_node(p) \ -do { \ - spin_lock(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) - -#define spin_lock_irq_rcu_node(p) \ -do { \ - spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define spin_unlock_irq_rcu_node(p) \ - spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) - -#define spin_lock_irqsave_rcu_node(p, flags) \ -do { \ - spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - smp_mb__after_unlock_lock(); \ -} while (0) - -#define spin_trylock_irqsave_rcu_node(p, flags) \ -({ \ - bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ - \ - if (___locked) \ - smp_mb__after_unlock_lock(); \ - ___locked; \ -}) - -#define spin_unlock_irqrestore_rcu_node(p, flags) \ - spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ - /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and @@ -131,7 +97,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; @@ -186,7 +152,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) /* Each pass through this loop initializes one srcu_node structure. 
*/ srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_init(&ACCESS_PRIVATE(snp, lock)); + raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -242,7 +208,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->srcu_sup) return -ENOMEM; if (!is_static) - spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); + raw_spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); @@ -252,6 +218,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); + init_irq_work(&ssp->srcu_sup->irq_work, srcu_irq_work); ssp->srcu_sup->sda_is_static = is_static; if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); @@ -263,9 +230,12 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { - if (!init_srcu_struct_nodes(ssp, is_static ? GFP_ATOMIC : GFP_KERNEL)) + if (!preemptible()) + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC); + else if (init_srcu_struct_nodes(ssp, GFP_KERNEL)) + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); + else goto err_free_sda; - WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, @@ -394,20 +364,20 @@ static void srcu_transition_to_big(struct srcu_struct *ssp) /* Double-checked locking on ->srcu_size-state. 
*/ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) return; - spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } __srcu_transition_to_big(ssp); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* * Check to see if the just-encountered contention event justifies * a transition to SRCU_SIZE_BIG. */ -static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) +static void raw_spin_lock_irqsave_check_contention(struct srcu_struct *ssp) { unsigned long j; @@ -429,16 +399,16 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module * parameter permits this. */ -static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) +static void raw_spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) { struct srcu_struct *ssp = sdp->ssp; - if (spin_trylock_irqsave_rcu_node(sdp, *flags)) + if (raw_spin_trylock_irqsave_rcu_node(sdp, *flags)) return; - spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); - spin_lock_irqsave_check_contention(ssp); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); - spin_lock_irqsave_rcu_node(sdp, *flags); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); + raw_spin_lock_irqsave_check_contention(ssp); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); + raw_spin_lock_irqsave_rcu_node(sdp, *flags); } /* @@ -447,12 +417,12 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module * parameter permits this. 
*/ -static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) +static void raw_spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) { - if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) + if (raw_spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) return; - spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); - spin_lock_irqsave_check_contention(ssp); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); + raw_spin_lock_irqsave_check_contention(ssp); } /* @@ -470,13 +440,13 @@ static void check_init_srcu_struct(struct srcu_struct *ssp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) { - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); return; } init_srcu_struct_fields(ssp, true); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -742,13 +712,15 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); delay = srcu_get_delay(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (WARN_ON(!delay)) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ + /* Wait for irq_work to finish first as it may queue a new work. 
*/ + irq_work_sync(&sup->irq_work); flush_delayed_work(&sup->work); for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); @@ -960,7 +932,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) mutex_lock(&sup->srcu_cb_mutex); /* End the current grace period. */ - spin_lock_irq_rcu_node(sup); + raw_spin_lock_irq_rcu_node(sup); idx = rcu_seq_state(sup->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); if (srcu_gp_is_expedited(ssp)) @@ -971,7 +943,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq)) WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq); - spin_unlock_irq_rcu_node(sup); + raw_spin_unlock_irq_rcu_node(sup); mutex_unlock(&sup->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -983,7 +955,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) } else { idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); srcu_for_each_node_breadth_first(ssp, snp) { - spin_lock_irq_rcu_node(snp); + raw_spin_lock_irq_rcu_node(snp); cbs = false; last_lvl = snp >= sup->level[rcu_num_lvls - 1]; if (last_lvl) @@ -998,7 +970,7 @@ static void srcu_gp_end(struct srcu_struct *ssp) else mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - spin_unlock_irq_rcu_node(snp); + raw_spin_unlock_irq_rcu_node(snp); if (cbs) srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); } @@ -1008,27 +980,27 @@ static void srcu_gp_end(struct srcu_struct *ssp) if (!(gpseq & counter_wrap_check)) for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) sdp->srcu_gp_seq_needed_exp = gpseq; - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); } /* Callback initiation done, allow grace periods after next. 
*/ mutex_unlock(&sup->srcu_cb_mutex); /* Start a new grace period if needed. */ - spin_lock_irq_rcu_node(sup); + raw_spin_lock_irq_rcu_node(sup); gpseq = rcu_seq_current(&sup->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) { srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(sup); + raw_spin_unlock_irq_rcu_node(sup); srcu_reschedule(ssp, 0); } else { - spin_unlock_irq_rcu_node(sup); + raw_spin_unlock_irq_rcu_node(sup); } /* Transition to big if needed. */ @@ -1059,19 +1031,19 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) || (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) return; - spin_lock_irqsave_rcu_node(snp, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); sgsne = snp->srcu_gp_seq_needed_exp; if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } - spin_lock_irqsave_ssp_contention(ssp, &flags); + raw_spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s)) WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); } /* @@ -1109,12 +1081,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf) return; /* GP already done and CBs recorded. 
*/ - spin_lock_irqsave_rcu_node(snp, flags); + raw_spin_lock_irqsave_rcu_node(snp, flags); snp_seq = snp->srcu_have_cbs[idx]; if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { if (snp == snp_leaf && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == snp_leaf && snp_seq != s) { srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); return; @@ -1129,11 +1101,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, sgsne = snp->srcu_gp_seq_needed_exp; if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - spin_unlock_irqrestore_rcu_node(snp, flags); + raw_spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - spin_lock_irqsave_ssp_contention(ssp, &flags); + raw_spin_lock_irqsave_ssp_contention(ssp, &flags); if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -1154,13 +1126,17 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, // it isn't. And it does not have to be. After all, it // can only be executed during early boot when there is only // the one boot CPU running with interrupts still disabled. + // + // Use an irq_work here to avoid acquiring runqueue lock with + // srcu rcu_node::lock held. BPF instrument could introduce the + // opposite dependency, hence we need to break the possible + // locking dependency here. 
if (likely(srcu_init_done)) - queue_delayed_work(rcu_gp_wq, &sup->work, - !!srcu_get_delay(ssp)); + irq_work_queue(&sup->irq_work); else if (list_empty(&sup->work.work.entry)) list_add(&sup->work.work.entry, &srcu_boot_list); } - spin_unlock_irqrestore_rcu_node(sup, flags); + raw_spin_unlock_irqrestore_rcu_node(sup, flags); } /* @@ -1172,9 +1148,9 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1285,12 +1261,12 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); + raw_spin_lock_irqsave_rcu_node(sdp, flags); if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); return false; /* Callbacks already present, so not idle. */ } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); /* * No local callbacks, so probabilistically probe global state. 
@@ -1350,7 +1326,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_sdp_contention(sdp, &flags); + raw_spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); /* @@ -1410,7 +1386,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); /* Ensure that snp node tree is fully initialized before traversing it */ if (ss_state < SRCU_SIZE_WAIT_BARRIER) @@ -1522,7 +1498,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) /* * Make sure that later code is ordered after the SRCU grace - * period. This pairs with the spin_lock_irq_rcu_node() + * period. This pairs with the raw_spin_lock_irq_rcu_node() * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed * because the current CPU might have been totally uninvolved with * (and thus unordered against) that grace period. 
@@ -1701,7 +1677,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) */ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) { - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); @@ -1710,7 +1686,7 @@ static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt); } - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); } /** @@ -1761,7 +1737,7 @@ static void srcu_expedite_current_cb(struct rcu_head *rhp) bool needcb = false; struct srcu_data *sdp = container_of(rhp, struct srcu_data, srcu_ec_head); - spin_lock_irqsave_sdp_contention(sdp, &flags); + raw_spin_lock_irqsave_sdp_contention(sdp, &flags); if (sdp->srcu_ec_state == SRCU_EC_IDLE) { WARN_ON_ONCE(1); } else if (sdp->srcu_ec_state == SRCU_EC_PENDING) { @@ -1771,7 +1747,7 @@ static void srcu_expedite_current_cb(struct rcu_head *rhp) sdp->srcu_ec_state = SRCU_EC_PENDING; needcb = true; } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); // If needed, requeue ourselves as an expedited SRCU callback. 
if (needcb) __call_srcu(sdp->ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); @@ -1795,7 +1771,7 @@ void srcu_expedite_current(struct srcu_struct *ssp) migrate_disable(); sdp = this_cpu_ptr(ssp->sda); - spin_lock_irqsave_sdp_contention(sdp, &flags); + raw_spin_lock_irqsave_sdp_contention(sdp, &flags); if (sdp->srcu_ec_state == SRCU_EC_IDLE) { sdp->srcu_ec_state = SRCU_EC_PENDING; needcb = true; @@ -1804,7 +1780,7 @@ void srcu_expedite_current(struct srcu_struct *ssp) } else { WARN_ON_ONCE(sdp->srcu_ec_state != SRCU_EC_REPOST); } - spin_unlock_irqrestore_rcu_node(sdp, flags); + raw_spin_unlock_irqrestore_rcu_node(sdp, flags); // If needed, queue an expedited SRCU callback. if (needcb) __call_srcu(ssp, &sdp->srcu_ec_head, srcu_expedite_current_cb, false); @@ -1848,17 +1824,17 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1872,10 +1848,10 @@ static void srcu_advance_state(struct srcu_struct *ssp) return; /* readers present, retry later. 
*/ } srcu_flip(ssp); - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2); ssp->srcu_sup->srcu_n_exp_nodelay = 0; - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) { @@ -1913,7 +1889,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); @@ -1924,7 +1900,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) */ if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } @@ -1932,7 +1908,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); len = ready_cbs.len; - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); @@ -1947,11 +1923,11 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. 
*/ - spin_lock_irq_rcu_node(sdp); + raw_spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - spin_unlock_irq_rcu_node(sdp); + raw_spin_unlock_irq_rcu_node(sdp); /* An SRCU barrier or callbacks from previous nesting work pending */ if (more) srcu_schedule_cbs_sdp(sdp, 0); @@ -1965,7 +1941,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) { bool pushgp = true; - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1975,7 +1951,7 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(ssp); } - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (pushgp) queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay); @@ -1995,9 +1971,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); - spin_lock_irq_rcu_node(ssp->srcu_sup); + raw_spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); - spin_unlock_irq_rcu_node(ssp->srcu_sup); + raw_spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { @@ -2015,6 +1991,23 @@ static void process_srcu(struct work_struct *work) srcu_reschedule(ssp, curdelay); } +static void srcu_irq_work(struct irq_work *work) +{ + struct srcu_struct *ssp; + struct srcu_usage *sup; + unsigned long delay; + unsigned long flags; + + sup = container_of(work, struct srcu_usage, irq_work); + ssp = sup->srcu_ssp; + + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); + delay = srcu_get_delay(ssp); + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); + + 
queue_delayed_work(rcu_gp_wq, &sup->work, !!delay); +} + void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 2b55e6acf3c1..48f0d803c8e2 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -291,9 +291,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) shift = ilog2(rcu_task_cpu_ids / lim); if (((rcu_task_cpu_ids - 1) >> shift) >= lim) shift++; - WRITE_ONCE(rtp->percpu_enqueue_shift, shift); - WRITE_ONCE(rtp->percpu_dequeue_lim, lim); - smp_store_release(&rtp->percpu_enqueue_lim, lim); + rtp->percpu_enqueue_shift = shift; + rtp->percpu_dequeue_lim = lim; + rtp->percpu_enqueue_lim = lim; pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n", rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index b3337c7231cc..1047b30cd46b 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -379,6 +379,38 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) } /* + * Determine if the bypass queue needs to be flushed based on time and size. + * For lazy-only bypass queues, use the lazy flush timeout; otherwise flush + * based on jiffy advancement. The flush_faster controls flush aggressiveness. + */ +static bool nocb_bypass_needs_flush(struct rcu_data *rdp, long bypass_ncbs, + long lazy_ncbs, unsigned long j, + bool flush_faster) +{ + bool bypass_is_lazy; + unsigned long bypass_first; + unsigned long flush_timeout; + long qhimark_thresh; + + if (!bypass_ncbs) + return false; + + qhimark_thresh = flush_faster ? qhimark : 2 * qhimark; + if (bypass_ncbs >= qhimark_thresh) + return true; + + bypass_first = READ_ONCE(rdp->nocb_bypass_first); + bypass_is_lazy = (bypass_ncbs == lazy_ncbs); + + if (bypass_is_lazy) + flush_timeout = rcu_get_jiffies_lazy_flush(); + else + flush_timeout = flush_faster ? 
0 : 1; + + return time_after(j, bypass_first + flush_timeout); +} + +/* * See whether it is appropriate to use the ->nocb_bypass list in order * to control contention on ->nocb_lock. A limited number of direct * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass @@ -404,7 +436,8 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long cur_gp_seq; unsigned long j = jiffies; long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - bool bypass_is_lazy = (ncbs == READ_ONCE(rdp->lazy_len)); + long lazy_len = READ_ONCE(rdp->lazy_len); + bool bypass_is_lazy = (ncbs == lazy_len); lockdep_assert_irqs_disabled(); @@ -456,10 +489,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // If ->nocb_bypass has been used too long or is too full, // flush ->nocb_bypass to ->cblist. - if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) || - (ncbs && bypass_is_lazy && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) || - ncbs >= qhimark) { + if (nocb_bypass_needs_flush(rdp, ncbs, lazy_len, j, true)) { rcu_nocb_lock(rdp); *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); @@ -673,15 +703,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); lazy_ncbs = READ_ONCE(rdp->lazy_len); - if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) || - bypass_ncbs > 2 * qhimark)) { - flush_bypass = true; - } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || - bypass_ncbs > 2 * qhimark)) { - flush_bypass = true; - } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { + flush_bypass = nocb_bypass_needs_flush(rdp, bypass_ncbs, lazy_ncbs, j, false); + if (!flush_bypass && !bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); continue; /* 
No callbacks here, try next. */ } @@ -1081,30 +1104,6 @@ static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) return 0; } -int rcu_nocb_cpu_deoffload(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - int ret = 0; - - cpus_read_lock(); - mutex_lock(&rcu_state.nocb_mutex); - if (rcu_rdp_is_offloaded(rdp)) { - if (!cpu_online(cpu)) { - ret = rcu_nocb_rdp_deoffload(rdp); - if (!ret) - cpumask_clear_cpu(cpu, rcu_nocb_mask); - } else { - pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu); - ret = -EINVAL; - } - } - mutex_unlock(&rcu_state.nocb_mutex); - cpus_read_unlock(); - - return ret; -} -EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); - static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) { unsigned long flags; @@ -1149,28 +1148,52 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) return 0; } -int rcu_nocb_cpu_offload(int cpu) +/* Common helper for CPU offload/deoffload operations. */ +static int rcu_nocb_cpu_toggle_offload(int cpu, bool offload) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); int ret = 0; cpus_read_lock(); mutex_lock(&rcu_state.nocb_mutex); - if (!rcu_rdp_is_offloaded(rdp)) { - if (!cpu_online(cpu)) { - ret = rcu_nocb_rdp_offload(rdp); - if (!ret) - cpumask_set_cpu(cpu, rcu_nocb_mask); - } else { - pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu); - ret = -EINVAL; - } + + /* Already in desired state, nothing to do. */ + if (rcu_rdp_is_offloaded(rdp) == offload) + goto out_unlock; + + if (cpu_online(cpu)) { + pr_info("NOCB: Cannot CB-%soffload online CPU %d\n", + offload ? 
"" : "de", rdp->cpu); + ret = -EINVAL; + goto out_unlock; } + + if (offload) { + ret = rcu_nocb_rdp_offload(rdp); + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } else { + ret = rcu_nocb_rdp_deoffload(rdp); + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } + +out_unlock: mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); - return ret; } + +int rcu_nocb_cpu_deoffload(int cpu) +{ + return rcu_nocb_cpu_toggle_offload(cpu, false /* de-offload */); +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); + +int rcu_nocb_cpu_offload(int cpu) +{ + return rcu_nocb_cpu_toggle_offload(cpu, true /* offload */); +} EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); #ifdef CONFIG_RCU_LAZY diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index d98a5c38e19c..b62735a67884 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -538,6 +538,28 @@ long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool do EXPORT_SYMBOL_GPL(torture_sched_setaffinity); #endif +#if IS_ENABLED(CONFIG_TRIVIAL_PREEMPT_RCU) +// Trivial and stupid grace-period wait. Defined here so that lockdep +// kernels can find tasklist_lock. +void synchronize_rcu_trivial_preempt(void) +{ + struct task_struct *g; + struct task_struct *t; + + smp_mb(); // Order prior accesses before grace-period start. + rcu_read_lock(); // Protect task list. + for_each_process_thread(g, t) { + if (t == current) + continue; // Don't deadlock on ourselves! + // Order later rcu_read_lock() on other tasks after QS. 
+ while (smp_load_acquire(&t->rcu_trivial_preempt_nesting)) + continue; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_trivial_preempt); +#endif // #if IS_ENABLED(CONFIG_TRIVIAL_PREEMPT_RCU) + int rcu_cpu_stall_notifiers __read_mostly; // !0 = provide stall notifiers (rarely useful) EXPORT_SYMBOL_GPL(rcu_cpu_stall_notifiers); diff --git a/kernel/resource.c b/kernel/resource.c index bb966699da31..d02a53fb95d8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -727,45 +727,46 @@ static int __find_resource_space(struct resource *root, struct resource *old, struct resource_constraint *constraint) { struct resource *this = root->child; - struct resource tmp = *new, avail, alloc; + struct resource full_avail = *new, avail, alloc; resource_alignf alignf = constraint->alignf; - tmp.start = root->start; + full_avail.start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment - * of this->start - 1 to tmp->end below would cause an underflow. + * of this->start - 1 to full_avail->end below would cause an underflow. */ if (this && this->start == root->start) { - tmp.start = (this == old) ? old->start : this->end + 1; + full_avail.start = (this == old) ? old->start : this->end + 1; this = this->sibling; } for(;;) { if (this) - tmp.end = (this == old) ? this->end : this->start - 1; + full_avail.end = (this == old) ? 
this->end : this->start - 1; else - tmp.end = root->end; + full_avail.end = root->end; - if (tmp.end < tmp.start) + if (full_avail.end < full_avail.start) goto next; - resource_clip(&tmp, constraint->min, constraint->max); - arch_remove_reservations(&tmp); + resource_clip(&full_avail, constraint->min, constraint->max); + arch_remove_reservations(&full_avail); /* Check for overflow after ALIGN() */ - avail.start = ALIGN(tmp.start, constraint->align); - avail.end = tmp.end; - avail.flags = new->flags & ~IORESOURCE_UNSET; - if (avail.start >= tmp.start) { + avail.start = ALIGN(full_avail.start, constraint->align); + avail.end = full_avail.end; + avail.flags = new->flags; + if (avail.start >= full_avail.start) { alloc.flags = avail.flags; if (alignf) { alloc.start = alignf(constraint->alignf_data, - &avail, size, constraint->align); + &avail, &full_avail, + size, constraint->align); } else { alloc.start = avail.start; } alloc.end = alloc.start + size - 1; if (alloc.start <= alloc.end && - resource_contains(&avail, &alloc)) { + __resource_contains_unbound(&full_avail, &alloc)) { new->start = alloc.start; new->end = alloc.end; return 0; @@ -776,7 +777,7 @@ next: if (!this || this->end == root->end) break; if (this != old) - tmp.start = this->end + 1; + full_avail.start = this->end + 1; this = this->sibling; } return -EBUSY; diff --git a/kernel/rseq.c b/kernel/rseq.c index b0973d19f366..38d3ef540760 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -80,6 +80,7 @@ #include <linux/syscalls.h> #include <linux/uaccess.h> #include <linux/types.h> +#include <linux/rseq.h> #include <asm/ptrace.h> #define CREATE_TRACE_POINTS @@ -449,13 +450,14 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq * size, the required alignment is the original struct rseq alignment. 
* - * In order to be valid, rseq_len is either the original rseq size, or - * large enough to contain all supported fields, as communicated to + * The rseq_len is required to be greater or equal to the original rseq + * size. In order to be valid, rseq_len is either the original rseq size, + * or large enough to contain all supported fields, as communicated to * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ if (rseq_len < ORIG_RSEQ_SIZE || (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 759777694c78..da20fb6ea25a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -122,6 +122,11 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_throttle_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_replenish_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_update_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_server_start_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_server_stop_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); @@ -687,11 +692,6 @@ bool raw_spin_rq_trylock(struct rq *rq) } } -void raw_spin_rq_unlock(struct rq *rq) -{ - raw_spin_unlock(rq_lockp(rq)); -} - /* * double_rq_lock - safely lock two runqueues */ @@ -872,7 +872,14 @@ void update_rq_clock(struct rq *rq) * Use HR-timers to deliver accurate preemption points. 
*/ -static void hrtick_clear(struct rq *rq) +enum { + HRTICK_SCHED_NONE = 0, + HRTICK_SCHED_DEFER = BIT(1), + HRTICK_SCHED_START = BIT(2), + HRTICK_SCHED_REARM_HRTIMER = BIT(3) +}; + +static void __used hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) hrtimer_cancel(&rq->hrtick_timer); @@ -897,12 +904,24 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void __hrtick_restart(struct rq *rq) +static inline bool hrtick_needs_rearm(struct hrtimer *timer, ktime_t expires) +{ + /* + * Queued is false when the timer is not started or currently + * running the callback. In both cases, restart. If queued check + * whether the expiry time actually changes substantially. + */ + return !hrtimer_is_queued(timer) || + abs(expires - hrtimer_get_expires(timer)) > 5000; +} + +static void hrtick_cond_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; ktime_t time = rq->hrtick_time; - hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); + if (hrtick_needs_rearm(timer, time)) + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -914,7 +933,7 @@ static void __hrtick_start(void *arg) struct rq_flags rf; rq_lock(rq, &rf); - __hrtick_restart(rq); + hrtick_cond_restart(rq); rq_unlock(rq, &rf); } @@ -925,7 +944,6 @@ static void __hrtick_start(void *arg) */ void hrtick_start(struct rq *rq, u64 delay) { - struct hrtimer *timer = &rq->hrtick_timer; s64 delta; /* @@ -933,27 +951,67 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta); + + /* + * If this is in the middle of schedule() only note the delay + * and let hrtick_schedule_exit() deal with it. 
+ */ + if (rq->hrtick_sched) { + rq->hrtick_sched |= HRTICK_SCHED_START; + rq->hrtick_delay = delta; + return; + } + + rq->hrtick_time = ktime_add_ns(ktime_get(), delta); + if (!hrtick_needs_rearm(&rq->hrtick_timer, rq->hrtick_time)) + return; if (rq == this_rq()) - __hrtick_restart(rq); + hrtimer_start(&rq->hrtick_timer, rq->hrtick_time, HRTIMER_MODE_ABS_PINNED_HARD); else smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -static void hrtick_rq_init(struct rq *rq) +static inline void hrtick_schedule_enter(struct rq *rq) { - INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); - hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + rq->hrtick_sched = HRTICK_SCHED_DEFER; + if (hrtimer_test_and_clear_rearm_deferred()) + rq->hrtick_sched |= HRTICK_SCHED_REARM_HRTIMER; } -#else /* !CONFIG_SCHED_HRTICK: */ -static inline void hrtick_clear(struct rq *rq) + +static inline void hrtick_schedule_exit(struct rq *rq) { + if (rq->hrtick_sched & HRTICK_SCHED_START) { + rq->hrtick_time = ktime_add_ns(ktime_get(), rq->hrtick_delay); + hrtick_cond_restart(rq); + } else if (idle_rq(rq)) { + /* + * No need for using hrtimer_is_active(). The timer is CPU local + * and interrupts are disabled, so the callback cannot be + * running and the queued state is valid. 
+ */ + if (hrtimer_is_queued(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); + } + + if (rq->hrtick_sched & HRTICK_SCHED_REARM_HRTIMER) + __hrtimer_rearm_deferred(); + + rq->hrtick_sched = HRTICK_SCHED_NONE; } -static inline void hrtick_rq_init(struct rq *rq) +static void hrtick_rq_init(struct rq *rq) { + INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); + rq->hrtick_sched = HRTICK_SCHED_NONE; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD | HRTIMER_MODE_LAZY_REARM); } +#else /* !CONFIG_SCHED_HRTICK: */ +static inline void hrtick_clear(struct rq *rq) { } +static inline void hrtick_rq_init(struct rq *rq) { } +static inline void hrtick_schedule_enter(struct rq *rq) { } +static inline void hrtick_schedule_exit(struct rq *rq) { } #endif /* !CONFIG_SCHED_HRTICK */ /* @@ -3847,6 +3905,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { + int this_cpu = smp_processor_id(); + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ if (!scx_allow_ttwu_queue(p)) return false; @@ -3871,10 +3931,10 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. 
*/ - if (!cpus_share_cache(smp_processor_id(), cpu)) + if (!cpus_share_cache(this_cpu, cpu)) return true; - if (cpu == smp_processor_id()) + if (cpu == this_cpu) return false; /* @@ -4721,7 +4781,7 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); - return scx_fork(p); + return scx_fork(p, kargs); } void sched_cancel_fork(struct task_struct *p) @@ -4729,13 +4789,16 @@ void sched_cancel_fork(struct task_struct *p) scx_cancel_fork(p); } +static void sched_mm_cid_fork(struct task_struct *t); + void sched_post_fork(struct task_struct *p) { + sched_mm_cid_fork(p); uclamp_post_fork(p); scx_post_fork(p); } -unsigned long to_ratio(u64 period, u64 runtime) +u64 to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) return BW_UNIT; @@ -4910,6 +4973,34 @@ static inline void finish_task(struct task_struct *prev) smp_store_release(&prev->on_cpu, 0); } +/* + * Only called from __schedule context + * + * There are some cases where we are going to re-do the action + * that added the balance callbacks. We may not be in a state + * where we can run them, so just zap them so they can be + * properly re-added on the next time around. This is similar + * handling to running the callbacks, except we just don't call + * them. + */ +static void zap_balance_callbacks(struct rq *rq) +{ + struct balance_callback *next, *head; + bool found = false; + + lockdep_assert_rq_held(rq); + + head = rq->balance_callback; + while (head) { + if (head == &balance_push_callback) + found = true; + next = head->next; + head->next = NULL; + head = next; + } + rq->balance_callback = found ? 
&balance_push_callback : NULL; +} + static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); @@ -5029,6 +5120,7 @@ static inline void finish_lock_switch(struct rq *rq) */ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); __balance_callbacks(rq, NULL); + hrtick_schedule_exit(rq); raw_spin_rq_unlock_irq(rq); } @@ -5678,7 +5770,7 @@ static void sched_tick_remote(struct work_struct *work) os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); if (os == TICK_SCHED_REMOTE_RUNNING) - queue_delayed_work(system_unbound_wq, dwork, HZ); + queue_delayed_work(system_dfl_wq, dwork, HZ); } static void sched_tick_start(int cpu) @@ -5697,7 +5789,7 @@ static void sched_tick_start(int cpu) if (os == TICK_SCHED_REMOTE_OFFLINE) { twork->cpu = cpu; INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + queue_delayed_work(system_dfl_wq, &twork->work, HZ); } } @@ -6495,6 +6587,8 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); *task_state_p = TASK_RUNNING; + set_task_blocked_on_waking(p, NULL); + return false; } @@ -6532,6 +6626,21 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SCHED_PROXY_EXEC +static inline void proxy_set_task_cpu(struct task_struct *p, int cpu) +{ + unsigned int wake_cpu; + + /* + * Since we are enqueuing a blocked task on a cpu it may + * not be able to run on, preserve wake_cpu when we + * __set_task_cpu so we can return the task to where it + * was previously runnable. 
+ */ + wake_cpu = p->wake_cpu; + __set_task_cpu(p, cpu); + p->wake_cpu = wake_cpu; +} + static inline struct task_struct *proxy_resched_idle(struct rq *rq) { put_prev_set_next_task(rq, rq->donor, rq->idle); @@ -6540,7 +6649,7 @@ static inline struct task_struct *proxy_resched_idle(struct rq *rq) return rq->idle; } -static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) +static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) { unsigned long state = READ_ONCE(donor->__state); @@ -6560,17 +6669,140 @@ static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) return try_to_block_task(rq, donor, &state, true); } -static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) +static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf) + __releases(__rq_lockp(rq)) +{ + /* + * The class scheduler may have queued a balance callback + * from pick_next_task() called earlier. + * + * So here we have to zap callbacks before unlocking the rq + * as another CPU may jump in and call sched_balance_rq + * which can trip the warning in rq_pin_lock() if we + * leave callbacks set. + * + * After we later reaquire the rq lock, we will force __schedule() + * to pick_again, so the callbacks will get re-established. + */ + zap_balance_callbacks(rq); + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); +} + +static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(__rq_lockp(rq)) +{ + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); + update_rq_clock(rq); +} + +/* + * If the blocked-on relationship crosses CPUs, migrate @p to the + * owner's CPU. + * + * This is because we must respect the CPU affinity of execution + * contexts (owner) but we can ignore affinity for scheduling + * contexts (@p). So we have to move scheduling contexts towards + * potential execution contexts. 
+ * + * Note: The owner can disappear, but simply migrate to @target_cpu + * and leave that CPU to sort things out. + */ +static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int target_cpu) + __must_hold(__rq_lockp(rq)) +{ + struct rq *target_rq = cpu_rq(target_cpu); + + lockdep_assert_rq_held(rq); + WARN_ON(p == rq->curr); + /* + * Since we are migrating a blocked donor, it could be rq->donor, + * and we want to make sure there aren't any references from this + * rq to it before we drop the lock. This avoids another cpu + * jumping in and grabbing the rq lock and referencing rq->donor + * or cfs_rq->curr, etc after we have migrated it to another cpu, + * and before we pick_again in __schedule. + * + * So call proxy_resched_idle() to drop the rq->donor references + * before we release the lock. + */ + proxy_resched_idle(rq); + + deactivate_task(rq, p, DEQUEUE_NOCLOCK); + proxy_set_task_cpu(p, target_cpu); + + proxy_release_rq_lock(rq, rf); + + attach_one_task(target_rq, p); + + proxy_reacquire_rq_lock(rq, rf); +} + +static void proxy_force_return(struct rq *rq, struct rq_flags *rf, + struct task_struct *p) + __must_hold(__rq_lockp(rq)) { - if (!__proxy_deactivate(rq, donor)) { + struct rq *task_rq, *target_rq = NULL; + int cpu, wake_flag = WF_TTWU; + + lockdep_assert_rq_held(rq); + WARN_ON(p == rq->curr); + + if (p == rq->donor) + proxy_resched_idle(rq); + + proxy_release_rq_lock(rq, rf); + /* + * We drop the rq lock, and re-grab task_rq_lock to get + * the pi_lock (needed for select_task_rq) as well. + */ + scoped_guard (task_rq_lock, p) { + task_rq = scope.rq; + /* - * XXX: For now, if deactivation failed, set donor - * as unblocked, as we aren't doing proxy-migrations - * yet (more logic will be needed then). + * Since we let go of the rq lock, the task may have been + * woken or migrated to another rq before we got the + * task_rq_lock. So re-check we're on the same RQ. 
If + * not, the task has already been migrated and that CPU + * will handle any futher migrations. */ - donor->blocked_on = NULL; + if (task_rq != rq) + break; + + /* + * Similarly, if we've been dequeued, someone else will + * wake us + */ + if (!task_on_rq_queued(p)) + break; + + /* + * Since we should only be calling here from __schedule() + * -> find_proxy_task(), no one else should have + * assigned current out from under us. But check and warn + * if we see this, then bail. + */ + if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) { + WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n", + __func__, cpu_of(task_rq), + p->comm, p->pid, p->on_cpu); + break; + } + + update_rq_clock(task_rq); + deactivate_task(task_rq, p, DEQUEUE_NOCLOCK); + cpu = select_task_rq(p, p->wake_cpu, &wake_flag); + set_task_cpu(p, cpu); + target_rq = cpu_rq(cpu); + clear_task_blocked_on(p, NULL); } - return NULL; + + if (target_rq) + attach_one_task(target_rq, p); + + proxy_reacquire_rq_lock(rq, rf); } /* @@ -6584,31 +6816,41 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d * p->pi_lock * rq->lock * mutex->wait_lock + * p->blocked_lock * * Returns the task that is going to be used as execution context (the one * that is actually going to be run on cpu_of(rq)). */ static struct task_struct * find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) + __must_hold(__rq_lockp(rq)) { struct task_struct *owner = NULL; + bool curr_in_chain = false; int this_cpu = cpu_of(rq); struct task_struct *p; struct mutex *mutex; + int owner_cpu; /* Follow blocked_on chain. 
*/ - for (p = donor; task_is_blocked(p); p = owner) { - mutex = p->blocked_on; - /* Something changed in the chain, so pick again */ - if (!mutex) - return NULL; + for (p = donor; (mutex = p->blocked_on); p = owner) { + /* if its PROXY_WAKING, do return migration or run if current */ + if (mutex == PROXY_WAKING) { + if (task_current(rq, p)) { + clear_task_blocked_on(p, PROXY_WAKING); + return p; + } + goto force_return; + } + /* * By taking mutex->wait_lock we hold off concurrent mutex_unlock() * and ensure @owner sticks around. */ guard(raw_spinlock)(&mutex->wait_lock); + guard(raw_spinlock)(&p->blocked_lock); - /* Check again that p is blocked with wait_lock held */ + /* Check again that p is blocked with blocked_lock held */ if (mutex != __get_task_blocked_on(p)) { /* * Something changed in the blocked_on chain and @@ -6619,20 +6861,39 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) return NULL; } + if (task_current(rq, p)) + curr_in_chain = true; + owner = __mutex_owner(mutex); if (!owner) { - __clear_task_blocked_on(p, mutex); - return p; + /* + * If there is no owner, either clear blocked_on + * and return p (if it is current and safe to + * just run on this rq), or return-migrate the task. + */ + if (task_current(rq, p)) { + __clear_task_blocked_on(p, NULL); + return p; + } + goto force_return; } if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { /* XXX Don't handle blocked owners/delayed dequeue yet */ - return proxy_deactivate(rq, donor); + if (curr_in_chain) + return proxy_resched_idle(rq); + goto deactivate; } - if (task_cpu(owner) != this_cpu) { - /* XXX Don't handle migrations yet */ - return proxy_deactivate(rq, donor); + owner_cpu = task_cpu(owner); + if (owner_cpu != this_cpu) { + /* + * @owner can disappear, simply migrate to @owner_cpu + * and leave that CPU to sort things out. 
+ */ + if (curr_in_chain) + return proxy_resched_idle(rq); + goto migrate_task; } if (task_on_rq_migrating(owner)) { @@ -6689,9 +6950,20 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * guarantee its existence, as per ttwu_remote(). */ } - WARN_ON_ONCE(owner && !owner->on_rq); return owner; + +deactivate: + if (proxy_deactivate(rq, donor)) + return NULL; + /* If deactivate fails, force return */ + p = donor; +force_return: + proxy_force_return(rq, rf, p); + return NULL; +migrate_task: + proxy_migrate_task(rq, rf, p, owner_cpu); + return NULL; } #else /* SCHED_PROXY_EXEC */ static struct task_struct * @@ -6702,23 +6974,6 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) } #endif /* SCHED_PROXY_EXEC */ -static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) -{ - if (!sched_proxy_exec()) - return; - /* - * pick_next_task() calls set_next_task() on the chosen task - * at some point, which ensures it is not push/pullable. - * However, the chosen/donor task *and* the mutex owner form an - * atomic pair wrt push/pull. - * - * Make sure owner we run is not pushable. Unfortunately we can - * only deal with that by means of a dequeue/enqueue cycle. :-/ - */ - dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); - enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); -} - /* * __schedule() is the main scheduler function. 
* @@ -6782,9 +7037,6 @@ static void __sched notrace __schedule(int sched_mode) schedule_debug(prev, preempt); - if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) - hrtick_clear(rq); - klp_sched_try_switch(prev); local_irq_disable(); @@ -6811,6 +7063,8 @@ static void __sched notrace __schedule(int sched_mode) rq_lock(rq, &rf); smp_mb__after_spinlock(); + hrtick_schedule_enter(rq); + /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; update_rq_clock(rq); @@ -6830,6 +7084,7 @@ static void __sched notrace __schedule(int sched_mode) /* SCX must consult the BPF scheduler to tell if rq is empty */ if (!rq->nr_running && !scx_enabled()) { next = prev; + rq->next_class = &idle_sched_class; goto picked; } } else if (!preempt && prev_state) { @@ -6845,16 +7100,45 @@ static void __sched notrace __schedule(int sched_mode) } pick_again: + assert_balance_callbacks_empty(rq); next = pick_next_task(rq, rq->donor, &rf); - rq_set_donor(rq, next); rq->next_class = next->sched_class; - if (unlikely(task_is_blocked(next))) { - next = find_proxy_task(rq, next, &rf); - if (!next) - goto pick_again; - if (next == rq->idle) - goto keep_resched; + if (sched_proxy_exec()) { + struct task_struct *prev_donor = rq->donor; + + rq_set_donor(rq, next); + if (unlikely(next->blocked_on)) { + next = find_proxy_task(rq, next, &rf); + if (!next) { + zap_balance_callbacks(rq); + goto pick_again; + } + if (next == rq->idle) { + zap_balance_callbacks(rq); + goto keep_resched; + } + } + if (rq->donor == prev_donor && prev != next) { + struct task_struct *donor = rq->donor; + /* + * When transitioning like: + * + * prev next + * donor: B B + * curr: A B or C + * + * then put_prev_set_next_task() will not have done + * anything, since B == B. However, A might have + * missed a RT/DL balance opportunity due to being + * on_cpu. 
+ */ + donor->sched_class->put_prev_task(rq, donor, donor); + donor->sched_class->set_next_task(rq, donor, true); + } + } else { + rq_set_donor(rq, next); } + picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -6870,9 +7154,6 @@ keep_resched: */ RCU_INIT_POINTER(rq->curr, next); - if (!task_current_donor(rq, next)) - proxy_tag_curr(rq, next); - /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6906,12 +7187,9 @@ keep_resched: /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { - /* In case next was already curr but just got blocked_donor */ - if (!task_current_donor(rq, next)) - proxy_tag_curr(rq, next); - rq_unpin_lock(rq, &rf); __balance_callbacks(rq, NULL); + hrtick_schedule_exit(rq); raw_spin_rq_unlock_irq(rq); } trace_sched_exit_tp(is_switch); @@ -10616,13 +10894,10 @@ static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pc } } -static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) +static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) { /* Remote access to mm::mm_cid::pcpu requires rq_lock */ guard(task_rq_lock)(t); - /* If the task is not active it is not in the users count */ - if (!t->mm_cid.active) - return false; if (cid_on_task(t->mm_cid.cid)) { /* If running on the CPU, put the CID in transit mode, otherwise drop it */ if (task_rq(t)->curr == t) @@ -10630,69 +10905,43 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm else mm_unset_cid_on_task(t); } - return true; } -static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm) +static void mm_cid_fixup_tasks_to_cpus(void) { - struct task_struct *p, *t; - unsigned int users; - - /* - * This can obviously race with a concurrent affinity change, which - * increases the number of allowed CPUs for this mm, but that does - * not affect the mode and only changes the CID constraints. 
A - * possible switch back to per task mode happens either in the - * deferred handler function or in the next fork()/exit(). - * - * The caller has already transferred. The newly incoming task is - * already accounted for, but not yet visible. - */ - users = mm->mm_cid.users - 2; - if (!users) - return; - - guard(rcu)(); - for_other_threads(current, t) { - if (mm_cid_fixup_task_to_cpu(t, mm)) - users--; - } + struct mm_struct *mm = current->mm; + struct task_struct *t; - if (!users) - return; + lockdep_assert_held(&mm->mm_cid.mutex); - /* Happens only for VM_CLONE processes. */ - for_each_process_thread(p, t) { - if (t == current || t->mm != mm) - continue; - if (mm_cid_fixup_task_to_cpu(t, mm)) { - if (--users == 0) - return; - } + hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) { + /* Current has already transferred before invoking the fixup. */ + if (t != current) + mm_cid_fixup_task_to_cpu(t, mm); } -} - -static void mm_cid_fixup_tasks_to_cpus(void) -{ - struct mm_struct *mm = current->mm; - mm_cid_do_fixup_tasks_to_cpus(mm); mm_cid_complete_transit(mm, MM_CID_ONCPU); } static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) { + lockdep_assert_held(&mm->mm_cid.lock); + t->mm_cid.active = 1; + hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list); mm->mm_cid.users++; return mm_update_max_cids(mm); } -void sched_mm_cid_fork(struct task_struct *t) +static void sched_mm_cid_fork(struct task_struct *t) { struct mm_struct *mm = t->mm; bool percpu; - WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); + if (!mm) + return; + + WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET); guard(mutex)(&mm->mm_cid.mutex); scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { @@ -10731,12 +10980,13 @@ void sched_mm_cid_fork(struct task_struct *t) static bool sched_mm_cid_remove_user(struct task_struct *t) { + lockdep_assert_held(&t->mm->mm_cid.lock); + t->mm_cid.active = 0; - scoped_guard(preempt) { - /* Clear the transition bit */ - t->mm_cid.cid = 
cid_from_transit_cid(t->mm_cid.cid); - mm_unset_cid_on_task(t); - } + /* Clear the transition bit */ + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); + mm_unset_cid_on_task(t); + hlist_del_init(&t->mm_cid.node); t->mm->mm_cid.users--; return mm_update_max_cids(t->mm); } @@ -10879,11 +11129,13 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p) mutex_init(&mm->mm_cid.mutex); mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); + INIT_HLIST_HEAD(&mm->mm_cid.user_list); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); bitmap_zero(mm_cidmask(mm), num_possible_cpus()); } #else /* CONFIG_SCHED_MM_CID */ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } #endif /* !CONFIG_SCHED_MM_CID */ static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 153232dd8276..ae9fd211cec1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -461,6 +461,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long prev_util = sg_cpu->util; unsigned long max_cap; @@ -482,10 +483,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; - cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min, sg_cpu->util, max_cap); - sg_cpu->sg_policy->last_freq_update_time = time; + sg_policy->last_freq_update_time = time; } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) diff --git a/kernel/sched/deadline.c 
b/kernel/sched/deadline.c index d08b00429323..edca7849b165 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -18,6 +18,7 @@ #include <linux/cpuset.h> #include <linux/sched/clock.h> +#include <linux/sched/deadline.h> #include <uapi/linux/sched/types.h> #include "sched.h" #include "pelt.h" @@ -57,17 +58,6 @@ static int __init sched_dl_sysctl_init(void) late_initcall(sched_dl_sysctl_init); #endif /* CONFIG_SYSCTL */ -static bool dl_server(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_server; -} - -static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) -{ - BUG_ON(dl_server(dl_se)); - return container_of(dl_se, struct task_struct, dl); -} - static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) { return container_of(dl_rq, struct rq, dl); @@ -115,6 +105,19 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) } #endif /* !CONFIG_RT_MUTEXES */ +static inline u8 dl_get_type(struct sched_dl_entity *dl_se, struct rq *rq) +{ + if (!dl_server(dl_se)) + return DL_TASK; + if (dl_se == &rq->fair_server) + return DL_SERVER_FAIR; +#ifdef CONFIG_SCHED_CLASS_EXT + if (dl_se == &rq->ext_server) + return DL_SERVER_EXT; +#endif + return DL_OTHER; +} + static inline struct dl_bw *dl_bw_of(int i) { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -733,6 +736,7 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, dl_se->dl_throttled = 1; dl_se->dl_defer_armed = 1; } + trace_sched_dl_replenish_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); } /* @@ -848,6 +852,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) if (dl_se->dl_throttled) dl_se->dl_throttled = 0; + trace_sched_dl_replenish_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); + /* * If this is the replenishment of a deferred reservation, * clear the flag and return. 
@@ -975,22 +981,6 @@ update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) } /* - * Regarding the deadline, a task with implicit deadline has a relative - * deadline == relative period. A task with constrained deadline has a - * relative deadline <= relative period. - * - * We support constrained deadline tasks. However, there are some restrictions - * applied only for tasks which do not have an implicit deadline. See - * update_dl_entity() to know more about such restrictions. - * - * The dl_is_implicit() returns true if the task has an implicit deadline. - */ -static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline == dl_se->dl_period; -} - -/* * When a deadline entity is placed in the runqueue, its runtime and deadline * might need to be updated. This is done by a CBS wake up rule. There are two * different rules: 1) the original CBS; and 2) the Revisited CBS. @@ -1027,7 +1017,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, rq_clock(rq))) { - if (unlikely(!dl_is_implicit(dl_se) && + if (unlikely((!dl_is_implicit(dl_se) || dl_se->dl_defer) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); @@ -1097,7 +1087,7 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) act = ns_to_ktime(dl_next_period(dl_se)); } - now = hrtimer_cb_get_time(timer); + now = ktime_get(); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -1345,6 +1335,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) return; + trace_sched_dl_throttle_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); dl_se->dl_throttled = 1; if (dl_se->runtime > 0) dl_se->runtime = 0; @@ -1508,6 +1499,7 @@ static void 
update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { + trace_sched_dl_throttle_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); dl_se->dl_throttled = 1; /* If requested, inform the user about runtime overruns. */ @@ -1532,6 +1524,8 @@ throttle: if (!is_leftmost(dl_se, &rq->dl)) resched_curr(rq); + } else { + trace_sched_dl_update_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); } /* @@ -1810,6 +1804,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (WARN_ON_ONCE(!cpu_online(cpu_of(rq)))) return; + trace_sched_dl_server_start_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); dl_se->dl_server_active = 1; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) @@ -1821,6 +1816,8 @@ void dl_server_stop(struct sched_dl_entity *dl_se) if (!dl_server(dl_se) || !dl_server_active(dl_se)) return; + trace_sched_dl_server_stop_tp(dl_se, cpu_of(dl_se->rq), + dl_get_type(dl_se, dl_se->rq)); dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; @@ -2142,10 +2139,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags) { struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); if (!schedstat_enabled()) return; + if (p != rq->curr) + update_stats_wait_end_dl(dl_rq, dl_se); + if ((flags & DEQUEUE_SLEEP)) { unsigned int state; @@ -2801,12 +2802,26 @@ static int find_later_rq(struct task_struct *task) static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) { - struct task_struct *p; + struct task_struct *i, *p = NULL; + struct rb_node *next_node; if (!has_pushable_dl_tasks(rq)) return NULL; - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); + while (next_node) { + i = __node_2_pdl(next_node); + /* make sure task isn't on_cpu (possible with 
proxy-exec) */ + if (!task_on_cpu(rq, i)) { + p = i; + break; + } + + next_node = rb_next(next_node); + } + + if (!p) + return NULL; WARN_ON_ONCE(rq->cpu != task_cpu(p)); WARN_ON_ONCE(task_current(rq, p)); @@ -3613,13 +3628,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } -void __getparam_dl(struct task_struct *p, struct sched_attr *attr) +void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags) { struct sched_dl_entity *dl_se = &p->dl; + struct rq *rq = task_rq(p); + u64 adj_deadline; attr->sched_priority = p->rt_priority; - attr->sched_runtime = dl_se->dl_runtime; - attr->sched_deadline = dl_se->dl_deadline; + if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) { + guard(raw_spinlock_irq)(&rq->__lock); + update_rq_clock(rq); + if (task_current(rq, p)) + update_curr_dl(rq); + + attr->sched_runtime = dl_se->runtime; + adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns(); + attr->sched_deadline = adj_deadline; + } else { + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + } attr->sched_period = dl_se->dl_period; attr->sched_flags &= ~SCHED_DL_FLAGS; attr->sched_flags |= dl_se->flags; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b24f40f05019..74c1617cf652 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -8,6 +8,7 @@ */ #include <linux/debugfs.h> #include <linux/nmi.h> +#include <linux/log2.h> #include "sched.h" /* @@ -901,10 +902,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; + s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread; + s64 zero_vruntime = -1, sum_w_vruntime = -1; + u64 avruntime; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); 
+ unsigned int sum_shift; unsigned long flags; + u64 sum_weight; #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, "\n"); @@ -925,6 +930,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) if (last) right_vruntime = last->vruntime; zero_vruntime = cfs_rq->zero_vruntime; + sum_w_vruntime = cfs_rq->sum_w_vruntime; + sum_weight = cfs_rq->sum_weight; + sum_shift = cfs_rq->sum_shift; + avruntime = avg_vruntime(cfs_rq); raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", @@ -933,8 +942,13 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", SPLIT_NS(zero_vruntime)); + SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime", + sum_w_vruntime, ilog2(abs(sum_w_vruntime))); + SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight", + sum_weight); + SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", - SPLIT_NS(avg_vruntime(cfs_rq))); + SPLIT_NS(avruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 62b1f3ac5630..e426e27b6794 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,8 @@ #include <linux/btf_ids.h> #include "ext_idle.h" +static DEFINE_RAW_SPINLOCK(scx_sched_lock); + /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the @@ -17,7 +19,23 @@ * are used as temporary markers to indicate that the dereferences need to be * updated to point to the associated scheduler instances rather than scx_root. */ -static struct scx_sched __rcu *scx_root; +struct scx_sched __rcu *scx_root; + +/* + * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. + * Readers can hold either or rcu_read_lock(). 
+ */ +static LIST_HEAD(scx_sched_all); + +#ifdef CONFIG_EXT_SUB_SCHED +static const struct rhashtable_params scx_sched_hash_params = { + .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), + .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), + .head_offset = offsetof(struct scx_sched, hash_node), +}; + +static struct rhashtable scx_sched_hash; +#endif /* * During exit, a task may schedule after losing its PIDs. When disabling the @@ -33,37 +51,39 @@ static DEFINE_MUTEX(scx_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); -static int scx_bypass_depth; +static DEFINE_RAW_SPINLOCK(scx_bypass_lock); static cpumask_var_t scx_bypass_lb_donee_cpumask; static cpumask_var_t scx_bypass_lb_resched_cpumask; -static bool scx_aborting; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -/* - * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass - * depth on enable failure. Will be removed when bypass depth is moved into the - * sched instance. - */ -static bool scx_bypassed_for_enable; - static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); +#ifdef CONFIG_EXT_SUB_SCHED /* - * A monotically increasing sequence number that is incremented every time a - * scheduler is enabled. This can be used by to check if any custom sched_ext + * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit + * tasks for the sub-sched being enabled. Use a global variable instead of a + * per-task field as all enables are serialized. + */ +static struct scx_sched *scx_enabling_sub_sched; +#else +#define scx_enabling_sub_sched (struct scx_sched *)NULL +#endif /* CONFIG_EXT_SUB_SCHED */ + +/* + * A monotonically increasing sequence number that is incremented every time a + * scheduler is enabled. 
This can be used to check if any custom sched_ext * scheduler has ever been used in the system. */ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); /* - * The maximum amount of time in jiffies that a task may be runnable without - * being scheduled on a CPU. If this timeout is exceeded, it will trigger - * scx_error(). + * Watchdog interval. All scx_sched's share a single watchdog timer and the + * interval is half of the shortest sch->watchdog_timeout. */ -static unsigned long scx_watchdog_timeout; +static unsigned long scx_watchdog_interval; /* * The last time the delayed work was run. This delayed work relies on @@ -106,25 +126,6 @@ static const struct rhashtable_params dsq_hash_params = { static LLIST_HEAD(dsqs_to_free); -/* dispatch buf */ -struct scx_dsp_buf_ent { - struct task_struct *task; - unsigned long qseq; - u64 dsq_id; - u64 enq_flags; -}; - -static u32 scx_dsp_max_batch; - -struct scx_dsp_ctx { - struct rq *rq; - u32 cursor; - u32 nr_tasks; - struct scx_dsp_buf_ent buf[]; -}; - -static struct scx_dsp_ctx __percpu *scx_dsp_ctx; - /* string formatting from BPF */ struct scx_bstr_buf { u64 data[MAX_BPRINTF_VARARGS]; @@ -135,6 +136,8 @@ static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); static struct scx_bstr_buf scx_exit_bstr_buf; /* ops debug dump */ +static DEFINE_RAW_SPINLOCK(scx_dump_lock); + struct scx_dump_data { s32 cpu; bool first; @@ -156,7 +159,6 @@ static struct kset *scx_kset; * There usually is no reason to modify these as normal scheduler operation * shouldn't be affected by them. The knobs are primarily for debugging. 
*/ -static u64 scx_slice_dfl = SCX_SLICE_DFL; static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; @@ -193,10 +195,10 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond #define CREATE_TRACE_POINTS #include <trace/events/sched_ext.h> -static void process_ddsp_deferred_locals(struct rq *rq); +static void run_deferred(struct rq *rq); static bool task_dead_and_done(struct task_struct *p); -static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); +static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -227,28 +229,109 @@ static long jiffies_delta_msecs(unsigned long at, unsigned long now) return -(long)jiffies_to_msecs(now - at); } -/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ -static u32 higher_bits(u32 flags) +static bool u32_before(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_parent - Find the parent sched + * @sch: sched to find the parent of + * + * Returns the parent scheduler or %NULL if @sch is root. + */ +static struct scx_sched *scx_parent(struct scx_sched *sch) { - return ~((1 << fls(flags)) - 1); + if (sch->level) + return sch->ancestors[sch->level - 1]; + else + return NULL; } -/* return the mask with only the highest bit set */ -static u32 highest_bit(u32 flags) +/** + * scx_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: sched whose descendants to walk + * + * To be used by scx_for_each_descendant_pre(). Find the next descendant to + * visit for pre-order traversal of @root's descendants. @root is included in + * the iteration and the first node to be visited. 
+ */ +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, + struct scx_sched *root) { - int bit = fls(flags); - return ((u64)1 << bit) >> 1; + struct scx_sched *next; + + lockdep_assert(lockdep_is_held(&scx_enable_mutex) || + lockdep_is_held(&scx_sched_lock)); + + /* if first iteration, visit @root */ + if (!pos) + return root; + + /* visit the first child if exists */ + next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + while (pos != root) { + if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) + return list_next_entry(pos, sibling); + pos = scx_parent(pos); + } + + return NULL; } -static bool u32_before(u32 a, u32 b) +static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { - return (s32)(a - b) < 0; + return rhashtable_lookup(&scx_sched_hash, &cgroup_id, + scx_sched_hash_params); } -static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, - struct task_struct *p) +static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) { - return sch->global_dsqs[cpu_to_node(task_cpu(p))]; + rcu_assign_pointer(p->scx.sched, sch); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } +static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; } +static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} +#endif /* CONFIG_EXT_SUB_SCHED */ + +/** + * scx_is_descendant - Test whether sched is a descendant + * @sch: sched to test + * @ancestor: ancestor sched to test against + * + * Test whether @sch is a descendant of @ancestor. 
+ */ +static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) +{ + if (sch->level < ancestor->level) + return false; + return sch->ancestors[ancestor->level] == ancestor; +} + +/** + * scx_for_each_descendant_pre - pre-order walk of a sched's descendants + * @pos: iteration cursor + * @root: sched to walk the descendants of + * + * Walk @root's descendants. @root is included in the iteration and the first + * node to be visited. Must be called with either scx_enable_mutex or + * scx_sched_lock held. + */ +#define scx_for_each_descendant_pre(pos, root) \ + for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \ + (pos) = scx_next_descendant_pre((pos), (root))) + +static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu) +{ + return &sch->pnode[cpu_to_node(cpu)]->global_dsq; } static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) @@ -264,28 +347,106 @@ static const struct sched_class *scx_setscheduler_class(struct task_struct *p) return __setscheduler_class(p->policy, p->prio); } -/* - * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX - * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate - * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check - * whether it's running from an allowed context. +static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu) +{ + return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq; +} + +static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu) +{ +#ifdef CONFIG_EXT_SUB_SCHED + /* + * If @sch is a sub-sched which is bypassing, its tasks should go into + * the bypass DSQs of the nearest ancestor which is not bypassing. The + * not-bypassing ancestor is responsible for scheduling all tasks from + * bypassing sub-trees. If all ancestors including root are bypassing, + * all tasks should go to the root's bypass DSQs. 
+ * + * Whenever a sched starts bypassing, all runnable tasks in its subtree + * are re-enqueued after scx_bypassing() is turned on, guaranteeing that + * all tasks are transferred to the right DSQs. + */ + while (scx_parent(sch) && scx_bypassing(sch, cpu)) + sch = scx_parent(sch); +#endif /* CONFIG_EXT_SUB_SCHED */ + + return bypass_dsq(sch, cpu); +} + +/** + * bypass_dsp_enabled - Check if bypass dispatch path is enabled + * @sch: scheduler to check + * + * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled + * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors + * are bypassing. In the former case, the ancestor is not itself bypassing but + * its bypass DSQs will be populated with bypassed tasks from descendants. Thus, + * the ancestor's bypass dispatch path must be active even though its own + * bypass_depth remains zero. * - * @mask is constant, always inline to cull the mask calculations. + * This function checks bypass_dsp_enable_depth which is managed separately from + * bypass_depth to enable this decoupling. See enable_bypass_dsp() and + * disable_bypass_dsp(). */ -static __always_inline void scx_kf_allow(u32 mask) +static bool bypass_dsp_enabled(struct scx_sched *sch) { - /* nesting is allowed only in increasing scx_kf_mask order */ - WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, - "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", - current->scx.kf_mask, mask); - current->scx.kf_mask |= mask; - barrier(); + return unlikely(atomic_read(&sch->bypass_dsp_enable_depth)); } -static void scx_kf_disallow(u32 mask) +/** + * rq_is_open - Is the rq available for immediate execution of an SCX task? + * @rq: rq to test + * @enq_flags: optional %SCX_ENQ_* of the task being enqueued + * + * Returns %true if @rq is currently open for executing an SCX task. 
After a + * %false return, @rq is guaranteed to invoke SCX dispatch path at least once + * before going to idle and not inserting a task into @rq's local DSQ after a + * %false return doesn't cause @rq to stall. + */ +static bool rq_is_open(struct rq *rq, u64 enq_flags) { - barrier(); - current->scx.kf_mask &= ~mask; + lockdep_assert_rq_held(rq); + + /* + * A higher-priority class task is either running or in the process of + * waking up on @rq. + */ + if (sched_class_above(rq->next_class, &ext_sched_class)) + return false; + + /* + * @rq is either in transition to or in idle and there is no + * higher-priority class task waking up on it. + */ + if (sched_class_above(&ext_sched_class, rq->next_class)) + return true; + + /* + * @rq is either picking, in transition to, or running an SCX task. + */ + + /* + * If we're in the dispatch path holding rq lock, $curr may or may not + * be ready depending on whether the on-going dispatch decides to extend + * $curr's slice. We say yes here and resolve it at the end of dispatch. + * See balance_one(). + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) + return true; + + /* + * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch, + * so allow it to avoid spuriously triggering reenq on a combined + * PREEMPT|IMMED insertion. + */ + if (enq_flags & SCX_ENQ_PREEMPT) + return true; + + /* + * @rq is either in transition to or running an SCX task and can't go + * idle without another SCX dispatch cycle. + */ + return false; } /* @@ -308,119 +469,77 @@ static inline void update_locked_rq(struct rq *rq) __this_cpu_write(scx_locked_rq_state, rq); } -#define SCX_CALL_OP(sch, mask, op, rq, args...) \ +#define SCX_CALL_OP(sch, op, rq, args...) 
\ do { \ if (rq) \ update_locked_rq(rq); \ - if (mask) { \ - scx_kf_allow(mask); \ - (sch)->ops.op(args); \ - scx_kf_disallow(mask); \ - } else { \ - (sch)->ops.op(args); \ - } \ + (sch)->ops.op(args); \ if (rq) \ update_locked_rq(NULL); \ } while (0) -#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \ +#define SCX_CALL_OP_RET(sch, op, rq, args...) \ ({ \ __typeof__((sch)->ops.op(args)) __ret; \ \ if (rq) \ update_locked_rq(rq); \ - if (mask) { \ - scx_kf_allow(mask); \ - __ret = (sch)->ops.op(args); \ - scx_kf_disallow(mask); \ - } else { \ - __ret = (sch)->ops.op(args); \ - } \ + __ret = (sch)->ops.op(args); \ if (rq) \ update_locked_rq(NULL); \ __ret; \ }) /* - * Some kfuncs are allowed only on the tasks that are subjects of the - * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such - * restrictions, the following SCX_CALL_OP_*() variants should be used when - * invoking scx_ops operations that take task arguments. These can only be used - * for non-nesting operations due to the way the tasks are tracked. - * - * kfuncs which can only operate on such tasks can in turn use - * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on - * the specific task. + * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments + * and records them in current->scx.kf_tasks[] for the duration of the call. A + * kfunc invoked from inside such an op can then use + * scx_kf_arg_task_ok() to verify that its task argument is one of + * those subject tasks. + * + * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - + * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock + * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if + * kf_tasks[] is set, @p's scheduler-protected fields are stable. + * + * kf_tasks[] can not stack, so task-based SCX ops must not nest. 
The + * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants + * while a previous one is still in progress. */ -#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \ do { \ - BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP((sch), mask, op, rq, task, ##args); \ + SCX_CALL_OP((sch), op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \ ({ \ __typeof__((sch)->ops.op(task, ##args)) __ret; \ - BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \ ({ \ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ - BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ }) -/* @mask is constant, always inline to cull unnecessary branches */ -static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask) -{ - if (unlikely(!(current->scx.kf_mask & mask))) { - scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); - return false; - } - - /* - * Enforce nesting boundaries. e.g. 
A kfunc which can be called from - * DISPATCH must not be called if we're running DEQUEUE which is nested - * inside ops.dispatch(). We don't need to check boundaries for any - * blocking kfuncs as the verifier ensures they're only called from - * sleepable progs. - */ - if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && - (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_error(sch, "cpu_release kfunc called from a nested operation"); - return false; - } - - if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && - (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_error(sch, "dispatch kfunc called from a nested operation"); - return false; - } - - return true; -} - /* see SCX_CALL_OP_TASK() */ -static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, - u32 mask, +static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch, struct task_struct *p) { - if (!scx_kf_allowed(sch, mask)) - return false; - if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { scx_error(sch, "called on a task not being operated on"); @@ -430,9 +549,22 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, return true; } +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ + SCX_DSQ_ITER_REV = 1U << 16, + + __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, + __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, + + __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, + __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | + __SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME, +}; + /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being iterated + * @dsq: non-local dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -472,6 +604,85 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ (p) = nldsq_next_task((dsq), (p), false)) +/** + * 
nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ
+ * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
+ * @dsq: non-local dsq being iterated
+ *
+ * Find the next task in a cursor-based iteration. The caller must have
+ * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock
+ * between the iteration steps.
+ *
+ * Only tasks which were queued before @cursor was initialized are visible. This
+ * bounds the iteration and guarantees that vtime never jumps in the other
+ * direction while iterating.
+ */
+static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor,
+						  struct scx_dispatch_q *dsq)
+{
+	bool rev = cursor->flags & SCX_DSQ_ITER_REV;
+	struct task_struct *p;
+
+	lockdep_assert_held(&dsq->lock);
+	BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR));
+
+	if (list_empty(&cursor->node))
+		p = NULL;
+	else
+		p = container_of(cursor, struct task_struct, scx.dsq_list);
+
+	/* skip cursors and tasks that were queued after @cursor init */
+	do {
+		p = nldsq_next_task(dsq, p, rev);
+	} while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq)));
+
+	if (p) {
+		if (rev)
+			list_move_tail(&cursor->node, &p->scx.dsq_list.node);
+		else
+			list_move(&cursor->node, &p->scx.dsq_list.node);
+	} else {
+		list_del_init(&cursor->node);
+	}
+
+	return p;
+}
+
+/**
+ * nldsq_cursor_lost_task - Test whether someone else took the task since iteration
+ * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
+ * @rq: rq @p was on
+ * @dsq: dsq @p was on
+ * @p: target task
+ *
+ * @p is a task returned by nldsq_cursor_next_task(). The locks may have been
+ * dropped and re-acquired in between. Verify that no one else took or is in the
+ * process of taking @p from @dsq.
+ *
+ * On %false return, the caller can assume full ownership of @p.
+ */
+static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor,
+				   struct rq *rq, struct scx_dispatch_q *dsq,
+				   struct task_struct *p)
+{
+	lockdep_assert_rq_held(rq);
+	lockdep_assert_held(&dsq->lock);
+
+	/*
+	 * @p could have already left $src_dsq, got re-enqueued, or be in the
+	 * process of being consumed by someone else.
+	 */
+	if (unlikely(p->scx.dsq != dsq ||
+		     u32_before(cursor->priv, p->scx.dsq_seq) ||
+		     p->scx.holding_cpu >= 0))
+		return true;
+
+	/* if @p has stayed on @dsq, its rq couldn't have changed */
+	if (WARN_ON_ONCE(rq != task_rq(p)))
+		return true;
+
+	return false;
+}
 
 /*
  * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
@@ -479,19 +690,6 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
  * changes without breaking backward compatibility. Can be used with
  * bpf_for_each(). See bpf_iter_scx_dsq_*().
  */
-enum scx_dsq_iter_flags {
-	/* iterate in the reverse dispatch order */
-	SCX_DSQ_ITER_REV		= 1U << 16,
-
-	__SCX_DSQ_ITER_HAS_SLICE	= 1U << 30,
-	__SCX_DSQ_ITER_HAS_VTIME	= 1U << 31,
-
-	__SCX_DSQ_ITER_USER_FLAGS	= SCX_DSQ_ITER_REV,
-	__SCX_DSQ_ITER_ALL_FLAGS	= __SCX_DSQ_ITER_USER_FLAGS |
-					  __SCX_DSQ_ITER_HAS_SLICE |
-					  __SCX_DSQ_ITER_HAS_VTIME,
-};
-
 struct bpf_iter_scx_dsq_kern {
 	struct scx_dsq_list_node	cursor;
 	struct scx_dispatch_q		*dsq;
@@ -514,14 +712,31 @@ struct scx_task_iter {
 	struct rq_flags			rf;
 	u32				cnt;
 	bool				list_locked;
+#ifdef CONFIG_EXT_SUB_SCHED
+	struct cgroup			*cgrp;
+	struct cgroup_subsys_state	*css_pos;
+	struct css_task_iter		css_iter;
+#endif
 };
 
 /**
  * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
  * @iter: iterator to init
+ * @cgrp: Optional root of cgroup subhierarchy to iterate
+ *
+ * Initialize @iter. Once initialized, @iter must eventually be stopped with
+ * scx_task_iter_stop().
+ *
+ * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns
+ * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks.
* - * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter - * must eventually be stopped with scx_task_iter_stop(). + * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using + * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup + * task migrations. + * + * The two modes of iterations are largely independent and it's likely that + * scx_tasks can be removed in favor of always using cgroup iteration if + * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS. * * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() * between this and the first next() call or between any two next() calls. If @@ -532,10 +747,19 @@ struct scx_task_iter { * All tasks which existed when the iteration started are guaranteed to be * visited as long as they are not dead. */ -static void scx_task_iter_start(struct scx_task_iter *iter) +static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) { memset(iter, 0, sizeof(*iter)); +#ifdef CONFIG_EXT_SUB_SCHED + if (cgrp) { + lockdep_assert_held(&cgroup_mutex); + iter->cgrp = cgrp; + iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); + css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + return; + } +#endif raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; @@ -588,6 +812,14 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { +#ifdef CONFIG_EXT_SUB_SCHED + if (iter->cgrp) { + if (iter->css_pos) + css_task_iter_end(&iter->css_iter); + __scx_task_iter_rq_unlock(iter); + return; + } +#endif __scx_task_iter_maybe_relock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); @@ -611,6 +843,24 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) cond_resched(); } +#ifdef CONFIG_EXT_SUB_SCHED + if (iter->cgrp) { + while (iter->css_pos) { + 
struct task_struct *p; + + p = css_task_iter_next(&iter->css_iter); + if (p) + return p; + + css_task_iter_end(&iter->css_iter); + iter->css_pos = css_next_descendant_pre(iter->css_pos, + &iter->cgrp->self); + if (iter->css_pos) + css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + } + return NULL; + } +#endif __scx_task_iter_maybe_relock(iter); list_for_each_entry(pos, cursor, tasks_node) { @@ -810,16 +1060,6 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err return -EPROTO; } -static void run_deferred(struct rq *rq) -{ - process_ddsp_deferred_locals(rq); - - if (local_read(&rq->scx.reenq_local_deferred)) { - local_set(&rq->scx.reenq_local_deferred, 0); - reenq_local(rq); - } -} - static void deferred_bal_cb_workfn(struct rq *rq) { run_deferred(rq); @@ -845,10 +1085,18 @@ static void deferred_irq_workfn(struct irq_work *irq_work) static void schedule_deferred(struct rq *rq) { /* - * Queue an irq work. They are executed on IRQ re-enable which may take - * a bit longer than the scheduler hook in schedule_deferred_locked(). + * This is the fallback when schedule_deferred_locked() can't use + * the cheaper balance callback or wakeup hook paths (the target + * CPU is not in balance or wakeup). Currently, this is primarily + * hit by reenqueue operations targeting a remote CPU. + * + * Queue on the target CPU. The deferred work can run from any CPU + * correctly - the _locked() path already processes remote rqs from + * the calling CPU - but targeting the owning CPU allows IPI delivery + * without waiting for the calling CPU to re-enable IRQs and is + * cheaper as the reenqueue runs locally. 
*/ - irq_work_queue(&rq->scx.deferred_irq_work); + irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq)); } /** @@ -898,6 +1146,81 @@ static void schedule_deferred_locked(struct rq *rq) schedule_deferred(rq); } +static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq, + u64 reenq_flags, struct rq *locked_rq) +{ + struct rq *rq; + + /* + * Allowing reenqueues doesn't make sense while bypassing. This also + * blocks from new reenqueues to be scheduled on dead scheds. + */ + if (unlikely(READ_ONCE(sch->bypass_depth))) + return; + + if (dsq->id == SCX_DSQ_LOCAL) { + rq = container_of(dsq, struct rq, scx.local_dsq); + + struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq)); + struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local; + + /* + * Pairs with smp_mb() in process_deferred_reenq_locals() and + * guarantees that there is a reenq_local() afterwards. + */ + smp_mb(); + + if (list_empty(&drl->node) || + (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) { + + guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); + + if (list_empty(&drl->node)) + list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals); + WRITE_ONCE(drl->flags, drl->flags | reenq_flags); + } + } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) { + rq = this_rq(); + + struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq)); + struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user; + + /* + * Pairs with smp_mb() in process_deferred_reenq_users() and + * guarantees that there is a reenq_user() afterwards. 
+ */ + smp_mb(); + + if (list_empty(&dru->node) || + (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) { + + guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); + + if (list_empty(&dru->node)) + list_move_tail(&dru->node, &rq->scx.deferred_reenq_users); + WRITE_ONCE(dru->flags, dru->flags | reenq_flags); + } + } else { + scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id); + return; + } + + if (rq == locked_rq) + schedule_deferred_locked(rq); + else + schedule_deferred(rq); +} + +static void schedule_reenq_local(struct rq *rq, u64 reenq_flags) +{ + struct scx_sched *root = rcu_dereference_sched(scx_root); + + if (WARN_ON_ONCE(!root)) + return; + + schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq); +} + /** * touch_core_sched - Update timestamp used for core-sched task ordering * @rq: rq to read clock from, must be locked @@ -974,24 +1297,105 @@ static bool scx_dsq_priq_less(struct rb_node *node_a, return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); } -static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) +static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ - WRITE_ONCE(dsq->nr, dsq->nr + delta); + WRITE_ONCE(dsq->nr, dsq->nr + 1); + + /* + * Once @p reaches a local DSQ, it can only leave it by being dispatched + * to the CPU or dequeued. In both cases, the only way @p can go back to + * the BPF sched is through enqueueing. If being inserted into a local + * DSQ with IMMED, persist the state until the next enqueueing event in + * do_enqueue_task() so that we can maintain IMMED protection through + * e.g. SAVE/RESTORE cycles and slice extensions. 
+ */ + if (enq_flags & SCX_ENQ_IMMED) { + if (unlikely(dsq->id != SCX_DSQ_LOCAL)) { + WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK)); + return; + } + p->scx.flags |= SCX_TASK_IMMED; + } + + if (p->scx.flags & SCX_TASK_IMMED) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + + if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) + return; + + rq->scx.nr_immed++; + + /* + * If @rq already had other tasks or the current task is not + * done yet, @p can't go on the CPU immediately. Re-enqueue. + */ + if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags))) + schedule_reenq_local(rq, 0); + } +} + +static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p) +{ + /* see dsq_inc_nr() */ + WRITE_ONCE(dsq->nr, dsq->nr - 1); + + if (p->scx.flags & SCX_TASK_IMMED) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + + if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) || + WARN_ON_ONCE(rq->scx.nr_immed <= 0)) + return; + + rq->scx.nr_immed--; + } } static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { - p->scx.slice = READ_ONCE(scx_slice_dfl); + p->scx.slice = READ_ONCE(sch->slice_dfl); __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } +/* + * Return true if @p is moving due to an internal SCX migration, false + * otherwise. + */ +static inline bool task_scx_migrating(struct task_struct *p) +{ + /* + * We only need to check sticky_cpu: it is set to the destination + * CPU in move_remote_task_to_local_dsq() before deactivate_task() + * and cleared when the task is enqueued on the destination, so it + * is only non-negative during an internal SCX migration. + */ + return p->scx.sticky_cpu >= 0; +} + +/* + * Call ops.dequeue() if the task is in BPF custody and not migrating. + * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked. 
+ */ +static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, + struct task_struct *p, u64 deq_flags) +{ + if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p)) + return; + + if (SCX_HAS_OP(sch, dequeue)) + SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags); + + p->scx.flags &= ~SCX_TASK_IN_CUSTODY; +} + static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); bool preempt = false; + call_task_dequeue(scx_root, rq, p, 0); + /* * If @rq is in balance, the CPU is already vacant and looking for the * next task to run. No need to preempt or trigger resched after moving @@ -1010,8 +1414,9 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p resched_curr(rq); } -static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, - struct task_struct *p, u64 enq_flags) +static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) { bool is_local = dsq->id == SCX_DSQ_LOCAL; @@ -1027,7 +1432,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = find_global_dsq(sch, p); + dsq = find_global_dsq(sch, task_cpu(p)); raw_spin_lock(&dsq->lock); } } @@ -1099,20 +1504,33 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, } /* seq records the order tasks are queued, used by BPF DSQ iterator */ - dsq->seq++; + WRITE_ONCE(dsq->seq, dsq->seq + 1); p->scx.dsq_seq = dsq->seq; - dsq_mod_nr(dsq, 1); + dsq_inc_nr(dsq, p, enq_flags); p->scx.dsq = dsq; /* - * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the - * direct dispatch path, but we clear them here because the direct - * dispatch verdict may be overridden on the enqueue path during e.g. 
- * bypass. + * Update custody and call ops.dequeue() before clearing ops_state: + * once ops_state is cleared, waiters in ops_dequeue() can proceed + * and dequeue_task_scx() will RMW p->scx.flags. If we clear + * ops_state first, both sides would modify p->scx.flags + * concurrently in a non-atomic way. */ - p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; - p->scx.ddsp_enq_flags = 0; + if (is_local) { + local_dsq_post_enq(dsq, p, enq_flags); + } else { + /* + * Task on global/bypass DSQ: leave custody, task on + * non-terminal DSQ: enter custody. + */ + if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS) + call_task_dequeue(sch, rq, p, 0); + else + p->scx.flags |= SCX_TASK_IN_CUSTODY; + + raw_spin_unlock(&dsq->lock); + } /* * We're transitioning out of QUEUEING or DISPATCHING. store_release to @@ -1120,11 +1538,6 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, */ if (enq_flags & SCX_ENQ_CLEAR_OPSS) atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); - - if (is_local) - local_dsq_post_enq(dsq, p, enq_flags); - else - raw_spin_unlock(&dsq->lock); } static void task_unlink_from_dsq(struct task_struct *p, @@ -1139,7 +1552,7 @@ static void task_unlink_from_dsq(struct task_struct *p, } list_del_init(&p->scx.dsq_list.node); - dsq_mod_nr(dsq, -1); + dsq_dec_nr(dsq, p); if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { struct task_struct *first_task; @@ -1218,7 +1631,7 @@ static void dispatch_dequeue_locked(struct task_struct *p, static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, struct rq *rq, u64 dsq_id, - struct task_struct *p) + s32 tcpu) { struct scx_dispatch_q *dsq; @@ -1229,20 +1642,19 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return find_global_dsq(sch, p); + return find_global_dsq(sch, tcpu); return &cpu_rq(cpu)->scx.local_dsq; } if 
(dsq_id == SCX_DSQ_GLOBAL) - dsq = find_global_dsq(sch, p); + dsq = find_global_dsq(sch, tcpu); else dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { - scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", - dsq_id, p->comm, p->pid); - return find_global_dsq(sch, p); + scx_error(sch, "non-existent DSQ 0x%llx", dsq_id); + return find_global_dsq(sch, tcpu); } return dsq; @@ -1279,12 +1691,34 @@ static void mark_direct_dispatch(struct scx_sched *sch, p->scx.ddsp_enq_flags = enq_flags; } +/* + * Clear @p direct dispatch state when leaving the scheduler. + * + * Direct dispatch state must be cleared in the following cases: + * - direct_dispatch(): cleared on the synchronous enqueue path, deferred + * dispatch keeps the state until consumed + * - process_ddsp_deferred_locals(): cleared after consuming deferred state, + * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch + * verdict is ignored (local/global/bypass) + * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred + * cancellation and holding_cpu races + * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by + * the scx_bypass() loop, so that stale state is not reused by a subsequent + * scheduler instance + */ +static inline void clear_direct_dispatch(struct task_struct *p) +{ + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; +} + static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, u64 enq_flags) { struct rq *rq = task_rq(p); struct scx_dispatch_q *dsq = - find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); + find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); + u64 ddsp_enq_flags; touch_core_sched_dispatch(rq, p); @@ -1325,8 +1759,10 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, return; } - dispatch_enqueue(sch, dsq, p, - p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + ddsp_enq_flags = p->scx.ddsp_enq_flags; + clear_direct_dispatch(p); + + 
dispatch_enqueue(sch, rq, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); } static bool scx_rq_online(struct rq *rq) @@ -1344,18 +1780,26 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); struct task_struct **ddsp_taskp; struct scx_dispatch_q *dsq; unsigned long qseq; WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); - /* rq migration */ + /* internal movements - rq migration / RESTORE */ if (sticky_cpu == cpu_of(rq)) goto local_norefill; /* + * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr(). + * Note that exiting and migration-disabled tasks that skip + * ops.enqueue() below will lose IMMED protection unless + * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set. + */ + p->scx.flags &= ~SCX_TASK_IMMED; + + /* * If !scx_rq_online(), we already told the BPF scheduler that the CPU * is offline and are just running the hotplug path. Don't bother the * BPF scheduler. @@ -1363,7 +1807,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_rq_bypassing(rq)) { + if (scx_bypassing(sch, cpu_of(rq))) { __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); goto bypass; } @@ -1398,13 +1842,19 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); + SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags); *ddsp_taskp = NULL; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; /* + * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY + * so ops.dequeue() is called when it leaves custody. + */ + p->scx.flags |= SCX_TASK_IN_CUSTODY; + + /* * If not directly dispatched, QUEUEING isn't clear yet and dispatch or * dequeue may be waiting. 
The store_release matches their load_acquire. */ @@ -1415,16 +1865,16 @@ direct: direct_dispatch(sch, p, enq_flags); return; local_norefill: - dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); + dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags); return; local: dsq = &rq->scx.local_dsq; goto enqueue; global: - dsq = find_global_dsq(sch, p); + dsq = find_global_dsq(sch, task_cpu(p)); goto enqueue; bypass: - dsq = &task_rq(p)->scx.bypass_dsq; + dsq = bypass_enq_target_dsq(sch, task_cpu(p)); goto enqueue; enqueue: @@ -1435,7 +1885,8 @@ enqueue: */ touch_core_sched(rq, p); refill_task_slice_dfl(sch, p); - dispatch_enqueue(sch, dsq, p, enq_flags); + clear_direct_dispatch(p); + dispatch_enqueue(sch, rq, dsq, p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -1466,19 +1917,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; } -static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) +static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); int sticky_cpu = p->scx.sticky_cpu; + u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags; if (enq_flags & ENQUEUE_WAKEUP) rq->scx.flags |= SCX_RQ_IN_WAKEUP; - enq_flags |= rq->scx.extra_enq_flags; - - if (sticky_cpu >= 0) - p->scx.sticky_cpu = -1; - /* * Restoring a running task will be immediately followed by * set_next_task_scx() which expects the task to not be on the BPF @@ -1499,7 +1946,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags add_nr_running(rq, 1); if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags); + SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags); if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); @@ -1509,6 +1956,9 @@ static void 
enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags dl_server_start(&rq->ext_server); do_enqueue_task(rq, p, enq_flags, sticky_cpu); + + if (sticky_cpu >= 0) + p->scx.sticky_cpu = -1; out: rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; @@ -1519,7 +1969,7 @@ out: static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); unsigned long opss; /* dequeue is always temporary, don't reset runnable_at */ @@ -1538,10 +1988,8 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) */ BUG(); case SCX_OPSS_QUEUED: - if (SCX_HAS_OP(sch, dequeue)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, - p, deq_flags); - + /* A queued task must always be in BPF scheduler's custody */ + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY)); if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) break; @@ -1564,11 +2012,35 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); break; } + + /* + * Call ops.dequeue() if the task is still in BPF custody. + * + * The code that clears ops_state to %SCX_OPSS_NONE does not always + * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when + * we're moving a task that was in %SCX_OPSS_DISPATCHING to a + * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE + * so that a concurrent dequeue can proceed, but we clear + * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the + * task. So we can see NONE + IN_CUSTODY here and we must handle + * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see + * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until + * it is enqueued on the destination. 
+ */ + call_task_dequeue(sch, rq, p, deq_flags); } -static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) +static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); + u64 deq_flags = core_deq_flags; + + /* + * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property + * change (not sleep or core-sched pick). + */ + if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) + deq_flags |= SCX_DEQ_SCHED_CHANGE; if (!(p->scx.flags & SCX_TASK_QUEUED)) { WARN_ON_ONCE(task_runnable(p)); @@ -1591,11 +2063,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags */ if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { update_curr_scx(rq); - SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false); + SCX_CALL_OP_TASK(sch, stopping, rq, p, false); } if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags); + SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags); if (deq_flags & SCX_DEQ_SLEEP) p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; @@ -1607,32 +2079,56 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags sub_nr_running(rq, 1); dispatch_dequeue(rq, p); + clear_direct_dispatch(p); return true; } static void yield_task_scx(struct rq *rq) { - struct scx_sched *sch = scx_root; struct task_struct *p = rq->donor; + struct scx_sched *sch = scx_task_sched(p); if (SCX_HAS_OP(sch, yield)) - SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); + SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL); else p->scx.slice = 0; } static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { - struct scx_sched *sch = scx_root; struct task_struct *from = rq->donor; + struct scx_sched *sch = scx_task_sched(from); - if (SCX_HAS_OP(sch, yield)) - return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, - 
from, to); + if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to)) + return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to); else return false; } +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) +{ + /* + * Preemption between SCX tasks is implemented by resetting the victim + * task's slice to 0 and triggering reschedule on the target CPU. + * Nothing to do. + */ + if (p->sched_class == &ext_sched_class) + return; + + /* + * Getting preempted by a higher-priority class. Reenqueue IMMED tasks. + * This captures all preemption cases including: + * + * - A SCX task is currently running. + * + * - @rq is waking from idle due to a SCX task waking to it. + * + * - A higher-priority wakes up while SCX dispatch is in progress. + */ + if (rq->scx.nr_immed) + schedule_reenq_local(rq, 0); +} + static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct rq *dst_rq) @@ -1650,7 +2146,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, else list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); - dsq_mod_nr(dst_dsq, 1); + dsq_inc_nr(dst_dsq, p, enq_flags); p->scx.dsq = dst_dsq; local_dsq_post_enq(dst_dsq, p, enq_flags); @@ -1670,10 +2166,13 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, { lockdep_assert_rq_held(src_rq); - /* the following marks @p MIGRATING which excludes dequeue */ + /* + * Set sticky_cpu before deactivate_task() to properly mark the + * beginning of an SCX-internal migration. 
+ */ + p->scx.sticky_cpu = cpu_of(dst_rq); deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu_of(dst_rq)); - p->scx.sticky_cpu = cpu_of(dst_rq); raw_spin_rq_unlock(src_rq); raw_spin_rq_lock(dst_rq); @@ -1713,7 +2212,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { - int cpu = cpu_of(rq); + s32 cpu = cpu_of(rq); WARN_ON_ONCE(task_cpu(p) == cpu); @@ -1807,13 +2306,14 @@ static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, !WARN_ON_ONCE(src_rq != task_rq(p)); } -static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, +static bool consume_remote_task(struct rq *this_rq, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *dsq, struct rq *src_rq) { raw_spin_rq_unlock(this_rq); if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { - move_remote_task_to_local_dsq(p, 0, src_rq, this_rq); + move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq); return true; } else { raw_spin_rq_unlock(src_rq); @@ -1853,8 +2353,9 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dst_dsq = find_global_dsq(sch, p); + dst_dsq = find_global_dsq(sch, task_cpu(p)); dst_rq = src_rq; + enq_flags |= SCX_ENQ_GDSQ_FALLBACK; } } else { /* no need to migrate if destination is a non-local DSQ */ @@ -1885,14 +2386,14 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, dispatch_dequeue_locked(p, src_dsq); raw_spin_unlock(&src_dsq->lock); - dispatch_enqueue(sch, dst_dsq, p, enq_flags); + dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags); } return dst_rq; } static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, - struct scx_dispatch_q *dsq) + struct scx_dispatch_q *dsq, u64 enq_flags) { struct task_struct *p; retry: @@ -1917,18 +2418,18 @@ retry: * the system into the bypass mode. 
This can easily live-lock the * machine. If aborting, exit from all non-bypass DSQs. */ - if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS) + if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS) break; if (rq == task_rq) { task_unlink_from_dsq(p, dsq); - move_local_task_to_local_dsq(p, 0, dsq, rq); + move_local_task_to_local_dsq(p, enq_flags, dsq, rq); raw_spin_unlock(&dsq->lock); return true; } if (task_can_run_on_remote_rq(sch, p, rq, false)) { - if (likely(consume_remote_task(rq, p, dsq, task_rq))) + if (likely(consume_remote_task(rq, p, enq_flags, dsq, task_rq))) return true; goto retry; } @@ -1942,7 +2443,7 @@ static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq) { int node = cpu_to_node(cpu_of(rq)); - return consume_dispatch_q(sch, rq, sch->global_dsqs[node]); + return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0); } /** @@ -1975,15 +2476,15 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, * If dispatching to @rq that @p is already on, no lock dancing needed. 
*/ if (rq == src_rq && rq == dst_rq) { - dispatch_enqueue(sch, dst_dsq, p, + dispatch_enqueue(sch, rq, dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dispatch_enqueue(sch, find_global_dsq(sch, p), p, - enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p, + enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK); return; } @@ -2020,7 +2521,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, */ if (src_rq == dst_rq) { p->scx.holding_cpu = -1; - dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p, + dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p, enq_flags); } else { move_remote_task_to_local_dsq(p, enq_flags, @@ -2090,6 +2591,12 @@ retry: if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) return; + /* see SCX_EV_INSERT_NOT_OWNED definition */ + if (unlikely(!scx_task_on_sched(sch, p))) { + __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); + return; + } + /* * While we know @p is accessible, we don't yet have a claim on * it - the BPF scheduler is allowed to dispatch tasks @@ -2114,17 +2621,17 @@ retry: BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); - dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p); + dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p)); if (dsq->id == SCX_DSQ_LOCAL) dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); else - dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); } static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) { - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; u32 u; for (u = 0; u < dspc->cursor; u++) { @@ -2151,67 +2658,54 @@ static inline void maybe_queue_balance_callback(struct rq *rq) rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING; } -static int balance_one(struct rq *rq, struct 
task_struct *prev) +/* + * One user of this function is scx_bpf_dispatch() which can be called + * recursively as sub-sched dispatches nest. Always inline to reduce stack usage + * from the call frame. + */ +static __always_inline bool +scx_dispatch_sched(struct scx_sched *sch, struct rq *rq, + struct task_struct *prev, bool nested) { - struct scx_sched *sch = scx_root; - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); - bool prev_on_scx = prev->sched_class == &ext_sched_class; - bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; + struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; int nr_loops = SCX_DSP_MAX_LOOPS; + s32 cpu = cpu_of(rq); + bool prev_on_sch = (prev->sched_class == &ext_sched_class) && + scx_task_on_sched(sch, prev); - lockdep_assert_rq_held(rq); - rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~SCX_RQ_BAL_KEEP; - - if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && - unlikely(rq->scx.cpu_released)) { - /* - * If the previous sched_class for the current CPU was not SCX, - * notify the BPF scheduler that it again has control of the - * core. This callback complements ->cpu_release(), which is - * emitted in switch_class(). - */ - if (SCX_HAS_OP(sch, cpu_acquire)) - SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, - cpu_of(rq), NULL); - rq->scx.cpu_released = false; - } + if (consume_global_dsq(sch, rq)) + return true; - if (prev_on_scx) { - update_curr_scx(rq); + if (bypass_dsp_enabled(sch)) { + /* if @sch is bypassing, only the bypass DSQs are active */ + if (scx_bypassing(sch, cpu)) + return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0); +#ifdef CONFIG_EXT_SUB_SCHED /* - * If @prev is runnable & has slice left, it has priority and - * fetching more just increases latency for the fetched tasks. - * Tell pick_task_scx() to keep running @prev. If the BPF - * scheduler wants to handle this explicitly, it should - * implement ->cpu_release(). 
+ * If @sch isn't bypassing but its children are, @sch is + * responsible for making forward progress for both its own + * tasks that aren't bypassing and the bypassing descendants' + * tasks. The following implements a simple built-in behavior - + * let each CPU try to run the bypass DSQ every Nth time. * - * See scx_disable_workfn() for the explanation on the bypassing - * test. + * Later, if necessary, we can add an ops flag to suppress the + * auto-consumption and a kfunc to consume the bypass DSQ and, + * so that the BPF scheduler can fully control scheduling of + * bypassed tasks. */ - if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { - rq->scx.flags |= SCX_RQ_BAL_KEEP; - goto has_tasks; - } - } + struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); - /* if there already are tasks to run, nothing to do */ - if (rq->scx.local_dsq.nr) - goto has_tasks; - - if (consume_global_dsq(sch, rq)) - goto has_tasks; - - if (scx_rq_bypassing(rq)) { - if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq)) - goto has_tasks; - else - goto no_tasks; + if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) && + consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) { + __scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1); + return true; + } +#endif /* CONFIG_EXT_SUB_SCHED */ } if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) - goto no_tasks; + return false; dspc->rq = rq; @@ -2225,19 +2719,25 @@ static int balance_one(struct rq *rq, struct task_struct *prev) do { dspc->nr_tasks = 0; - SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, - cpu_of(rq), prev_on_scx ? prev : NULL); + if (nested) { + SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL); + } else { + /* stash @prev so that nested invocations can access it */ + rq->scx.sub_dispatch_prev = prev; + SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? 
prev : NULL); + rq->scx.sub_dispatch_prev = NULL; + } flush_dispatch_buf(sch, rq); - if (prev_on_rq && prev->scx.slice) { + if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) { rq->scx.flags |= SCX_RQ_BAL_KEEP; - goto has_tasks; + return true; } if (rq->scx.local_dsq.nr) - goto has_tasks; + return true; if (consume_global_dsq(sch, rq)) - goto has_tasks; + return true; /* * ops.dispatch() can trap us in this loop by repeatedly @@ -2246,21 +2746,80 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * balance(), we want to complete this scheduling cycle and then * start a new one. IOW, we want to call resched_curr() on the * next, most likely idle, task, not the current one. Use - * scx_kick_cpu() for deferred kicking. + * __scx_bpf_kick_cpu() for deferred kicking. */ if (unlikely(!--nr_loops)) { - scx_kick_cpu(sch, cpu_of(rq), 0); + scx_kick_cpu(sch, cpu, 0); break; } } while (dspc->nr_tasks); -no_tasks: + /* + * Prevent the CPU from going idle while bypassed descendants have tasks + * queued. Without this fallback, bypassed tasks could stall if the host + * scheduler's ops.dispatch() doesn't yield any tasks. + */ + if (bypass_dsp_enabled(sch)) + return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0); + + return false; +} + +static int balance_one(struct rq *rq, struct task_struct *prev) +{ + struct scx_sched *sch = scx_root; + s32 cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + rq->scx.flags |= SCX_RQ_IN_BALANCE; + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; + + if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. This callback complements ->cpu_release(), which is + * emitted in switch_class(). 
+ */ + if (SCX_HAS_OP(sch, cpu_acquire)) + SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL); + rq->scx.cpu_released = false; + } + + if (prev->sched_class == &ext_sched_class) { + update_curr_scx(rq); + + /* + * If @prev is runnable & has slice left, it has priority and + * fetching more just increases latency for the fetched tasks. + * Tell pick_task_scx() to keep running @prev. If the BPF + * scheduler wants to handle this explicitly, it should + * implement ->cpu_release(). + * + * See scx_disable_workfn() for the explanation on the bypassing + * test. + */ + if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && + !scx_bypassing(sch, cpu)) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + goto has_tasks; + } + } + + /* if there already are tasks to run, nothing to do */ + if (rq->scx.local_dsq.nr) + goto has_tasks; + + if (scx_dispatch_sched(sch, rq, prev, false)) + goto has_tasks; + /* * Didn't find another task to run. Keep running @prev unless * %SCX_OPS_ENQ_LAST is in effect. */ - if (prev_on_rq && - (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) { + if ((prev->scx.flags & SCX_TASK_QUEUED) && + (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; @@ -2269,40 +2828,26 @@ no_tasks: return false; has_tasks: - rq->scx.flags &= ~SCX_RQ_IN_BALANCE; - return true; -} - -static void process_ddsp_deferred_locals(struct rq *rq) -{ - struct task_struct *p; - - lockdep_assert_rq_held(rq); - /* - * Now that @rq can be unlocked, execute the deferred enqueueing of - * tasks directly dispatched to the local DSQs of other CPUs. See - * direct_dispatch(). Keep popping from the head instead of using - * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq - * temporarily. 
+ * @rq may have extra IMMED tasks without reenq scheduled: + * + * - rq_is_open() can't reliably tell when and how slice is going to be + * modified for $curr and allows IMMED tasks to be queued while + * dispatch is in progress. + * + * - A non-IMMED HEAD task can get queued in front of an IMMED task + * between the IMMED queueing and the subsequent scheduling event. */ - while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, - struct task_struct, scx.dsq_list.node))) { - struct scx_sched *sch = scx_root; - struct scx_dispatch_q *dsq; + if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed)) + schedule_reenq_local(rq, 0); - list_del_init(&p->scx.dsq_list.node); - - dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); - if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) - dispatch_to_local_dsq(sch, rq, dsq, p, - p->scx.ddsp_enq_flags); - } + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; + return true; } static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (p->scx.flags & SCX_TASK_QUEUED) { /* @@ -2317,7 +2862,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p); + SCX_CALL_OP_TASK(sch, running, rq, p); clr_task_runnable(p, true); @@ -2389,8 +2934,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) .task = next, }; - SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq, - cpu_of(rq), &args); + SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args); } rq->scx.cpu_released = true; } @@ -2399,16 +2943,16 @@ static void switch_class(struct rq *rq, struct task_struct *next) static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = 
scx_task_sched(p); - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); update_curr_scx(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true); + SCX_CALL_OP_TASK(sch, stopping, rq, p, true); if (p->scx.flags & SCX_TASK_QUEUED) { set_task_runnable(rq, p); @@ -2417,11 +2961,17 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * If @p has slice left and is being put, @p is getting * preempted by a higher priority scheduler class or core-sched * forcing a different task. Leave it at the head of the local - * DSQ. + * DSQ unless it was an IMMED task. IMMED tasks should not + * linger on a busy CPU, reenqueue them to the BPF scheduler. */ - if (p->scx.slice && !scx_rq_bypassing(rq)) { - dispatch_enqueue(sch, &rq->scx.local_dsq, p, - SCX_ENQ_HEAD); + if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { + if (p->scx.flags & SCX_TASK_IMMED) { + p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + } else { + dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); + } goto switch_class; } @@ -2444,6 +2994,48 @@ switch_class: switch_class(rq, next); } +static void kick_sync_wait_bal_cb(struct rq *rq) +{ + struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); + unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; + bool waited; + s32 cpu; + + /* + * Drop rq lock and enable IRQs while waiting. IRQs must be enabled + * — a target CPU may be waiting for us to process an IPI (e.g. TLB + * flush) while we wait for its kick_sync to advance. + * + * Also, keep advancing our own kick_sync so that new kick_sync waits + * targeting us, which can start after we drop the lock, cannot form + * cyclic dependencies. 
+ */ +retry: + waited = false; + for_each_cpu(cpu, rq->scx.cpus_to_sync) { + /* + * smp_load_acquire() pairs with smp_store_release() on + * kick_sync updates on the target CPUs. + */ + if (cpu == cpu_of(rq) || + smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { + cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); + continue; + } + + raw_spin_rq_unlock_irq(rq); + while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + cpu_relax(); + } + raw_spin_rq_lock_irq(rq); + waited = true; + } + + if (waited) + goto retry; +} + static struct task_struct *first_local_task(struct rq *rq) { return list_first_entry_or_null(&rq->scx.local_dsq.list, @@ -2457,10 +3049,10 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) bool keep_prev; struct task_struct *p; - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); - rq->next_class = &ext_sched_class; + rq_modified_begin(rq, &ext_sched_class); rq_unpin_lock(rq, rf); balance_one(rq, prev); @@ -2468,6 +3060,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) maybe_queue_balance_callback(rq); /* + * Defer to a balance callback which can drop rq lock and enable + * IRQs. Waiting directly in the pick path would deadlock against + * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. + */ + if (unlikely(rq->scx.kick_sync_pending)) { + rq->scx.kick_sync_pending = false; + queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, + kick_sync_wait_bal_cb); + } + + /* * If any higher-priority sched class enqueued a runnable task on * this rq during balance_one(), abort and return RETRY_TASK, so * that the scheduler loop can restart. 
@@ -2475,7 +3078,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) * If @force_scx is true, always try to pick a SCHED_EXT task, * regardless of any higher-priority sched classes activity. */ - if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class)) + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; @@ -2493,16 +3096,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) if (keep_prev) { p = prev; if (!p->scx.slice) - refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); + refill_task_slice_dfl(scx_task_sched(p), p); } else { p = first_local_task(rq); if (!p) return NULL; if (unlikely(!p->scx.slice)) { - struct scx_sched *sch = rcu_dereference_sched(scx_root); + struct scx_sched *sch = scx_task_sched(p); - if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { + if (!scx_bypassing(sch, cpu_of(rq)) && + !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); sch->warned_zero_slice = true; @@ -2568,16 +3172,17 @@ void ext_server_init(struct rq *rq) bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch_a = scx_task_sched(a); + struct scx_sched *sch_b = scx_task_sched(b); /* * The const qualifiers are dropped from task_struct pointers when * calling ops.core_sched_before(). Accesses are controlled by the * verifier. 
*/ - if (SCX_HAS_OP(sch, core_sched_before) && - !scx_rq_bypassing(task_rq(a))) - return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before, + if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && + !scx_bypassing(sch_a, task_cpu(a))) + return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, NULL, (struct task_struct *)a, (struct task_struct *)b); @@ -2588,8 +3193,8 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { - struct scx_sched *sch = scx_root; - bool rq_bypass; + struct scx_sched *sch = scx_task_sched(p); + bool bypassing; /* * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it @@ -2604,8 +3209,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - rq_bypass = scx_rq_bypassing(task_rq(p)); - if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) { + bypassing = scx_bypassing(sch, task_cpu(p)); + if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) { s32 cpu; struct task_struct **ddsp_taskp; @@ -2613,10 +3218,9 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - cpu = SCX_CALL_OP_TASK_RET(sch, - SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, - select_cpu, NULL, p, prev_cpu, - wake_flags); + this_rq()->scx.in_select_cpu = true; + cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags); + this_rq()->scx.in_select_cpu = false; p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) @@ -2635,7 +3239,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag } p->scx.selected_cpu = cpu; - if (rq_bypass) + if (bypassing) __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); return cpu; } @@ -2649,7 +3253,7 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p) static void 
set_cpus_allowed_scx(struct task_struct *p, struct affinity_context *ac) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); set_cpus_allowed_common(p, ac); @@ -2665,14 +3269,13 @@ static void set_cpus_allowed_scx(struct task_struct *p, * designation pointless. Cast it away when calling the operation. */ if (SCX_HAS_OP(sch, set_cpumask)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL, - p, (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr); } static void handle_hotplug(struct rq *rq, bool online) { struct scx_sched *sch = scx_root; - int cpu = cpu_of(rq); + s32 cpu = cpu_of(rq); atomic_long_inc(&scx_hotplug_seq); @@ -2688,9 +3291,9 @@ static void handle_hotplug(struct rq *rq, bool online) scx_idle_update_selcpu_topology(&sch->ops); if (online && SCX_HAS_OP(sch, cpu_online)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu); + SCX_CALL_OP(sch, cpu_online, NULL, cpu); else if (!online && SCX_HAS_OP(sch, cpu_offline)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); + SCX_CALL_OP(sch, cpu_offline, NULL, cpu); else scx_exit(sch, SCX_EXIT_UNREG_KERN, SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, @@ -2718,7 +3321,6 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } - static bool check_rq_for_timeouts(struct rq *rq) { struct scx_sched *sch; @@ -2732,10 +3334,11 @@ static bool check_rq_for_timeouts(struct rq *rq) goto out_unlock; list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { + struct scx_sched *sch = scx_task_sched(p); unsigned long last_runnable = p->scx.runnable_at; if (unlikely(time_after(jiffies, - last_runnable + scx_watchdog_timeout))) { + last_runnable + READ_ONCE(sch->watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, @@ -2752,6 +3355,7 @@ out_unlock: static void scx_watchdog_workfn(struct work_struct *work) { + unsigned long 
intv; int cpu; WRITE_ONCE(scx_watchdog_timestamp, jiffies); @@ -2762,28 +3366,30 @@ static void scx_watchdog_workfn(struct work_struct *work) cond_resched(); } - queue_delayed_work(system_unbound_wq, to_delayed_work(work), - scx_watchdog_timeout / 2); + + intv = READ_ONCE(scx_watchdog_interval); + if (intv < ULONG_MAX) + queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); } void scx_tick(struct rq *rq) { - struct scx_sched *sch; + struct scx_sched *root; unsigned long last_check; if (!scx_enabled()) return; - sch = rcu_dereference_bh(scx_root); - if (unlikely(!sch)) + root = rcu_dereference_bh(scx_root); + if (unlikely(!root)) return; last_check = READ_ONCE(scx_watchdog_timestamp); if (unlikely(time_after(jiffies, - last_check + READ_ONCE(scx_watchdog_timeout)))) { + last_check + READ_ONCE(root->watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_check); - scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + scx_exit(root, SCX_EXIT_ERROR_STALL, 0, "watchdog failed to check in for %u.%03us", dur_ms / 1000, dur_ms % 1000); } @@ -2793,7 +3399,7 @@ void scx_tick(struct rq *rq) static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(curr); update_curr_scx(rq); @@ -2801,11 +3407,11 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) * While disabling, always resched and refresh core-sched timestamp as * we can't trust the slice management or ops.core_sched_before(). 
*/ - if (scx_rq_bypassing(rq)) { + if (scx_bypassing(sch, cpu_of(rq))) { curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(sch, tick)) { - SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr); + SCX_CALL_OP_TASK(sch, tick, rq, curr); } if (!curr->scx.slice) @@ -2834,18 +3440,16 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ -static enum scx_task_state scx_get_task_state(const struct task_struct *p) +static u32 scx_get_task_state(const struct task_struct *p) { - return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; + return p->scx.flags & SCX_TASK_STATE_MASK; } -static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +static void scx_set_task_state(struct task_struct *p, u32 state) { - enum scx_task_state prev_state = scx_get_task_state(p); + u32 prev_state = scx_get_task_state(p); bool warn = false; - BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); - switch (state) { case SCX_TASK_NONE: break; @@ -2859,42 +3463,45 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) warn = prev_state != SCX_TASK_READY; break; default: - warn = true; + WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); return; } - WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", + WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", prev_state, state, p->comm, p->pid); p->scx.flags &= ~SCX_TASK_STATE_MASK; - p->scx.flags |= state << SCX_TASK_STATE_SHIFT; + p->scx.flags |= state; } -static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork) +static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) { - struct scx_sched *sch = scx_root; int ret; p->scx.disallow = false; if (SCX_HAS_OP(sch, init_task)) { struct scx_init_task_args args = { - SCX_INIT_TASK_ARGS_CGROUP(tg) + 
SCX_INIT_TASK_ARGS_CGROUP(task_group(p)) .fork = fork, }; - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL, - p, &args); + ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args); if (unlikely(ret)) { ret = ops_sanitize_err(sch, "init_task", ret); return ret; } } - scx_set_task_state(p, SCX_TASK_INIT); - if (p->scx.disallow) { - if (!fork) { + if (unlikely(scx_parent(sch))) { + scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", + p->comm, p->pid); + } else if (unlikely(fork)) { + scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", + p->comm, p->pid); + } else { struct rq *rq; struct rq_flags rf; @@ -2913,25 +3520,43 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork } task_rq_unlock(rq, p, &rf); - } else if (p->policy == SCHED_EXT) { - scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", - p->comm, p->pid); } } - p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; return 0; } -static void scx_enable_task(struct task_struct *p) +static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) +{ + int ret; + + ret = __scx_init_task(sch, p, fork); + if (!ret) { + /* + * While @p's rq is not locked. @p is not visible to the rest of + * SCX yet and it's safe to update the flags and state. + */ + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; + scx_set_task_state(p, SCX_TASK_INIT); + } + return ret; +} + +static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) { - struct scx_sched *sch = scx_root; struct rq *rq = task_rq(p); u32 weight; lockdep_assert_rq_held(rq); /* + * Verify the task is not in BPF scheduler's custody. If flag + * transitions are consistent, the flag should always be clear + * here. + */ + WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); + + /* * Set the weight before calling ops.enable() so that the scheduler * doesn't see a stale value if they inspect the task struct. 
*/ @@ -2943,34 +3568,47 @@ static void scx_enable_task(struct task_struct *p) p->scx.weight = sched_weight_to_cgroup(weight); if (SCX_HAS_OP(sch, enable)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p); - scx_set_task_state(p, SCX_TASK_ENABLED); + SCX_CALL_OP_TASK(sch, enable, rq, p); if (SCX_HAS_OP(sch, set_weight)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, - p, p->scx.weight); + SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); } -static void scx_disable_task(struct task_struct *p) +static void scx_enable_task(struct scx_sched *sch, struct task_struct *p) +{ + __scx_enable_task(sch, p); + scx_set_task_state(p, SCX_TASK_ENABLED); +} + +static void scx_disable_task(struct scx_sched *sch, struct task_struct *p) { - struct scx_sched *sch = scx_root; struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); + clear_direct_dispatch(p); + if (SCX_HAS_OP(sch, disable)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); + SCX_CALL_OP_TASK(sch, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); + + /* + * Verify the task is not in BPF scheduler's custody. If flag + * transitions are consistent, the flag should always be clear + * here. 
+ */ + WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); } -static void scx_exit_task(struct task_struct *p) +static void __scx_disable_and_exit_task(struct scx_sched *sch, + struct task_struct *p) { - struct scx_sched *sch = scx_root; struct scx_exit_task_args args = { .cancelled = false, }; + lockdep_assert_held(&p->pi_lock); lockdep_assert_rq_held(task_rq(p)); switch (scx_get_task_state(p)) { @@ -2982,7 +3620,7 @@ static void scx_exit_task(struct task_struct *p) case SCX_TASK_READY: break; case SCX_TASK_ENABLED: - scx_disable_task(p); + scx_disable_task(sch, p); break; default: WARN_ON_ONCE(true); @@ -2990,8 +3628,26 @@ static void scx_exit_task(struct task_struct *p) } if (SCX_HAS_OP(sch, exit_task)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p), - p, &args); + SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); +} + +static void scx_disable_and_exit_task(struct scx_sched *sch, + struct task_struct *p) +{ + __scx_disable_and_exit_task(sch, p); + + /* + * If set, @p exited between __scx_init_task() and scx_enable_task() in + * scx_sub_enable() and is initialized for both the associated sched and + * its parent. Disable and exit for the child too. 
+ */ + if ((p->scx.flags & SCX_TASK_SUB_INIT) && + !WARN_ON_ONCE(!scx_enabling_sub_sched)) { + __scx_disable_and_exit_task(scx_enabling_sub_sched, p); + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + + scx_set_task_sched(p, NULL); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3005,7 +3661,7 @@ void init_scx_entity(struct sched_ext_entity *scx) INIT_LIST_HEAD(&scx->runnable_node); scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; - scx->slice = READ_ONCE(scx_slice_dfl); + scx->slice = SCX_SLICE_DFL; } void scx_pre_fork(struct task_struct *p) @@ -3019,14 +3675,25 @@ void scx_pre_fork(struct task_struct *p) percpu_down_read(&scx_fork_rwsem); } -int scx_fork(struct task_struct *p) +int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) { + s32 ret; + percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_init_task_enabled) - return scx_init_task(p, task_group(p), true); - else - return 0; + if (scx_init_task_enabled) { +#ifdef CONFIG_EXT_SUB_SCHED + struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; +#else + struct scx_sched *sch = scx_root; +#endif + ret = scx_init_task(sch, p, true); + if (!ret) + scx_set_task_sched(p, sch); + return ret; + } + + return 0; } void scx_post_fork(struct task_struct *p) @@ -3044,7 +3711,7 @@ void scx_post_fork(struct task_struct *p) struct rq *rq; rq = task_rq_lock(p, &rf); - scx_enable_task(p); + scx_enable_task(scx_task_sched(p), p); task_rq_unlock(rq, p, &rf); } } @@ -3064,7 +3731,7 @@ void scx_cancel_fork(struct task_struct *p) rq = task_rq_lock(p, &rf); WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); - scx_exit_task(p); + scx_disable_and_exit_task(scx_task_sched(p), p); task_rq_unlock(rq, p, &rf); } @@ -3115,15 +3782,15 @@ void sched_ext_dead(struct task_struct *p) raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); /* - * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED - * transitions can't race us. Disable ops for @p. + * @p is off scx_tasks and wholly ours. 
scx_root_enable()'s READY -> + * ENABLED transitions can't race us. Disable ops for @p. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_exit_task(p); + scx_disable_and_exit_task(scx_task_sched(p), p); task_rq_unlock(rq, p, &rf); } } @@ -3131,7 +3798,7 @@ void sched_ext_dead(struct task_struct *p) static void reweight_task_scx(struct rq *rq, struct task_struct *p, const struct load_weight *lw) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); lockdep_assert_rq_held(task_rq(p)); @@ -3140,8 +3807,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); if (SCX_HAS_OP(sch, set_weight)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, - p, p->scx.weight); + SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); } static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) @@ -3150,20 +3816,19 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) static void switching_to_scx(struct rq *rq, struct task_struct *p) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (task_dead_and_done(p)) return; - scx_enable_task(p); + scx_enable_task(sch, p); /* * set_cpus_allowed_scx() is not called while @p is associated with a * different scheduler class. Keep the BPF scheduler up-to-date. 
*/ if (SCX_HAS_OP(sch, set_cpumask)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq, - p, (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr); } static void switched_from_scx(struct rq *rq, struct task_struct *p) @@ -3171,11 +3836,9 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) if (task_dead_and_done(p)) return; - scx_disable_task(p); + scx_disable_task(scx_task_sched(p), p); } -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {} - static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -3190,17 +3853,327 @@ int scx_check_setscheduler(struct task_struct *p, int policy) return 0; } +static void process_ddsp_deferred_locals(struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_rq_held(rq); + + /* + * Now that @rq can be unlocked, execute the deferred enqueueing of + * tasks directly dispatched to the local DSQs of other CPUs. See + * direct_dispatch(). Keep popping from the head instead of using + * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq + * temporarily. + */ + while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, + struct task_struct, scx.dsq_list.node))) { + struct scx_sched *sch = scx_task_sched(p); + struct scx_dispatch_q *dsq; + u64 dsq_id = p->scx.ddsp_dsq_id; + u64 enq_flags = p->scx.ddsp_enq_flags; + + list_del_init(&p->scx.dsq_list.node); + clear_direct_dispatch(p); + + dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p)); + if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) + dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); + } +} + +/* + * Determine whether @p should be reenqueued from a local DSQ. + * + * @reenq_flags is mutable and accumulates state across the DSQ walk: + * + * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First" + * tracks position in the DSQ list, not among IMMED tasks. 
A non-IMMED task at + * the head consumes the first slot. + * + * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if + * rq_is_open() is true. + * + * An IMMED task is kept (returns %false) only if it's the first task in the DSQ + * AND the current task is done — i.e. it will execute immediately. All other + * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head, + * every IMMED task behind it gets reenqueued. + * + * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ | + * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local + * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers + * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT + * in process_deferred_reenq_locals(). + */ +static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason) +{ + bool first; + + first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST); + *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST; + + *reason = SCX_TASK_REENQ_KFUNC; + + if ((p->scx.flags & SCX_TASK_IMMED) && + (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) { + __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1); + *reason = SCX_TASK_REENQ_IMMED; + return true; + } + + return *reenq_flags & SCX_REENQ_ANY; +} + +static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) +{ + LIST_HEAD(tasks); + u32 nr_enqueued = 0; + struct task_struct *p, *n; + + lockdep_assert_rq_held(rq); + + if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK)) + reenq_flags &= ~__SCX_REENQ_TSR_MASK; + if (rq_is_open(rq, 0)) + reenq_flags |= SCX_REENQ_TSR_RQ_OPEN; + + /* + * The BPF scheduler may choose to dispatch tasks back to + * @rq->scx.local_dsq. Move all candidate tasks off to a private list + * first to avoid processing the same tasks repeatedly. 
+ */ + list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, + scx.dsq_list.node) { + struct scx_sched *task_sch = scx_task_sched(p); + u32 reason; + + /* + * If @p is being migrated, @p's current CPU may not agree with + * its allowed CPUs and the migration_cpu_stop is about to + * deactivate and re-activate @p anyway. Skip re-enqueueing. + * + * While racing sched property changes may also dequeue and + * re-enqueue a migrating task while its current CPU and allowed + * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to + * the current local DSQ for running tasks and thus are not + * visible to the BPF scheduler. + */ + if (p->migration_pending) + continue; + + if (!scx_is_descendant(task_sch, sch)) + continue; + + if (!local_task_should_reenq(p, &reenq_flags, &reason)) + continue; + + dispatch_dequeue(rq, p); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + + list_add_tail(&p->scx.dsq_list.node, &tasks); + } + + list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { + list_del_init(&p->scx.dsq_list.node); + + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + nr_enqueued++; + } + + return nr_enqueued; +} + +static void process_deferred_reenq_locals(struct rq *rq) +{ + u64 seq = ++rq->scx.deferred_reenq_locals_seq; + + lockdep_assert_rq_held(rq); + + while (true) { + struct scx_sched *sch; + u64 reenq_flags; + bool skip = false; + + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { + struct scx_deferred_reenq_local *drl = + list_first_entry_or_null(&rq->scx.deferred_reenq_locals, + struct scx_deferred_reenq_local, + node); + struct scx_sched_pcpu *sch_pcpu; + + if (!drl) + return; + + sch_pcpu = container_of(drl, struct scx_sched_pcpu, + deferred_reenq_local); + sch = sch_pcpu->sch; + + reenq_flags = drl->flags; + WRITE_ONCE(drl->flags, 0); + list_del_init(&drl->node); + + if (likely(drl->seq != seq)) { 
+ drl->seq = seq; + drl->cnt = 0; + } else { + if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) { + scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times", + drl->cnt); + skip = true; + } + + __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1); + } + } + + if (!skip) { + /* see schedule_dsq_reenq() */ + smp_mb(); + + reenq_local(sch, rq, reenq_flags); + } + } +} + +static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason) +{ + *reason = SCX_TASK_REENQ_KFUNC; + return reenq_flags & SCX_REENQ_ANY; +} + +static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags) +{ + struct rq *locked_rq = rq; + struct scx_sched *sch = dsq->sched; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); + struct task_struct *p; + s32 nr_enqueued = 0; + + lockdep_assert_rq_held(rq); + + raw_spin_lock(&dsq->lock); + + while (likely(!READ_ONCE(sch->bypass_depth))) { + struct rq *task_rq; + u32 reason; + + p = nldsq_cursor_next_task(&cursor, dsq); + if (!p) + break; + + if (!user_task_should_reenq(p, reenq_flags, &reason)) + continue; + + task_rq = task_rq(p); + + if (locked_rq != task_rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + if (unlikely(!raw_spin_rq_trylock(task_rq))) { + raw_spin_unlock(&dsq->lock); + raw_spin_rq_lock(task_rq); + raw_spin_lock(&dsq->lock); + } + locked_rq = task_rq; + + /* did we lose @p while switching locks? 
*/ + if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) + continue; + } + + /* @p is on @dsq, its rq and @dsq are locked */ + dispatch_dequeue_locked(p, dsq); + raw_spin_unlock(&dsq->lock); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + + do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); + + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + + if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { + raw_spin_rq_unlock(locked_rq); + locked_rq = NULL; + cpu_relax(); + } + + raw_spin_lock(&dsq->lock); + } + + list_del_init(&cursor.node); + raw_spin_unlock(&dsq->lock); + + if (locked_rq != rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(rq); + } +} + +static void process_deferred_reenq_users(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + while (true) { + struct scx_dispatch_q *dsq; + u64 reenq_flags; + + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { + struct scx_deferred_reenq_user *dru = + list_first_entry_or_null(&rq->scx.deferred_reenq_users, + struct scx_deferred_reenq_user, + node); + struct scx_dsq_pcpu *dsq_pcpu; + + if (!dru) + return; + + dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, + deferred_reenq_user); + dsq = dsq_pcpu->dsq; + reenq_flags = dru->flags; + WRITE_ONCE(dru->flags, 0); + list_del_init(&dru->node); + } + + /* see schedule_dsq_reenq() */ + smp_mb(); + + BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); + reenq_user(rq, dsq, reenq_flags); + } +} + +static void run_deferred(struct rq *rq) +{ + process_ddsp_deferred_locals(rq); + + if (!list_empty(&rq->scx.deferred_reenq_locals)) + process_deferred_reenq_locals(rq); + + if (!list_empty(&rq->scx.deferred_reenq_users)) + process_deferred_reenq_users(rq); +} + #ifdef CONFIG_NO_HZ_FULL bool scx_can_stop_tick(struct rq *rq) { struct task_struct *p = rq->curr; - - if (scx_rq_bypassing(rq)) - return false; + struct scx_sched *sch = scx_task_sched(p); if (p->sched_class != 
&ext_sched_class) return true; + if (scx_bypassing(sch, cpu_of(rq))) + return false; + /* * @rq can dispatch from different DSQs, so we can't tell whether it * needs the tick or not by looking at nr_running. Allow stopping ticks @@ -3238,7 +4211,7 @@ int scx_tg_online(struct task_group *tg) .bw_quota_us = tg->scx.bw_quota_us, .bw_burst_us = tg->scx.bw_burst_us }; - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, + ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL, tg->css.cgroup, &args); if (ret) ret = ops_sanitize_err(sch, "cgroup_init", ret); @@ -3260,8 +4233,7 @@ void scx_tg_offline(struct task_group *tg) if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && (tg->scx.flags & SCX_TG_INITED)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, - tg->css.cgroup); + SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup); tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); } @@ -3290,8 +4262,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) continue; if (SCX_HAS_OP(sch, cgroup_prep_move)) { - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, - cgroup_prep_move, NULL, + ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL, p, from, css->cgroup); if (ret) goto err; @@ -3306,7 +4277,7 @@ err: cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(sch, cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + SCX_CALL_OP(sch, cgroup_cancel_move, NULL, p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } @@ -3327,7 +4298,7 @@ void scx_cgroup_move_task(struct task_struct *p) */ if (SCX_HAS_OP(sch, cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) - SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL, + SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p), p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); p->scx.cgrp_moving_from = NULL; @@ -3345,7 +4316,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(sch, 
cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + SCX_CALL_OP(sch, cgroup_cancel_move, NULL, p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } @@ -3359,8 +4330,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, - tg_cgrp(tg), weight); + SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight); tg->scx.weight = weight; @@ -3374,8 +4344,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle) percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL, - tg_cgrp(tg), idle); + SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); /* Update the task group's idle state */ tg->scx.idle = idle; @@ -3394,7 +4363,7 @@ void scx_group_set_bandwidth(struct task_group *tg, (tg->scx.bw_period_us != period_us || tg->scx.bw_quota_us != quota_us || tg->scx.bw_burst_us != burst_us)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL, + SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL, tg_cgrp(tg), period_us, quota_us, burst_us); tg->scx.bw_period_us = period_us; @@ -3403,33 +4372,55 @@ void scx_group_set_bandwidth(struct task_group *tg, percpu_up_read(&scx_cgroup_ops_rwsem); } +#endif /* CONFIG_EXT_GROUP_SCHED */ + +#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +static struct cgroup *root_cgroup(void) +{ + return &cgrp_dfl_root.cgrp; +} + +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, 
cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} static void scx_cgroup_lock(void) { +#ifdef CONFIG_EXT_GROUP_SCHED percpu_down_write(&scx_cgroup_ops_rwsem); +#endif cgroup_lock(); } static void scx_cgroup_unlock(void) { cgroup_unlock(); +#ifdef CONFIG_EXT_GROUP_SCHED percpu_up_write(&scx_cgroup_ops_rwsem); +#endif } - -#else /* CONFIG_EXT_GROUP_SCHED */ - +#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +static struct cgroup *root_cgroup(void) { return NULL; } +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} - -#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ /* * Omitted operations: * - * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task - * isn't tied to the CPU at that point. Preemption is implemented by resetting - * the victim task's slice to 0 and triggering reschedule on the target CPU. - * * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. 
* * - task_fork/dead: We need fork/dead notifications for all tasks regardless of @@ -3470,13 +4461,60 @@ DEFINE_SCHED_CLASS(ext) = { #endif }; -static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, + struct scx_sched *sch) { + s32 cpu; + memset(dsq, 0, sizeof(*dsq)); raw_spin_lock_init(&dsq->lock); INIT_LIST_HEAD(&dsq->list); dsq->id = dsq_id; + dsq->sched = sch; + + dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu); + if (!dsq->pcpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); + + pcpu->dsq = dsq; + INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); + } + + return 0; +} + +static void exit_dsq(struct scx_dispatch_q *dsq) +{ + s32 cpu; + + for_each_possible_cpu(cpu) { + struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); + struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user; + struct rq *rq = cpu_rq(cpu); + + /* + * There must have been a RCU grace period since the last + * insertion and @dsq should be off the deferred list by now. 
+ */ + if (WARN_ON_ONCE(!list_empty(&dru->node))) { + guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); + list_del_init(&dru->node); + } + } + + free_percpu(dsq->pcpu); +} + +static void free_dsq_rcufn(struct rcu_head *rcu) +{ + struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu); + + exit_dsq(dsq); + kfree(dsq); } static void free_dsq_irq_workfn(struct irq_work *irq_work) @@ -3485,7 +4523,7 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work) struct scx_dispatch_q *dsq, *tmp_dsq; llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) - kfree_rcu(dsq, rcu); + call_rcu(&dsq->rcu, free_dsq_rcufn); } static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); @@ -3550,8 +4588,7 @@ static void scx_cgroup_exit(struct scx_sched *sch) if (!sch->ops.cgroup_exit) continue; - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, - css->cgroup); + SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup); } } @@ -3582,10 +4619,9 @@ static int scx_cgroup_init(struct scx_sched *sch) continue; } - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, + ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL, css->cgroup, &args); if (ret) { - css_put(css); scx_error(sch, "ops.cgroup_init() failed (%d)", ret); return ret; } @@ -3662,6 +4698,7 @@ static const struct attribute_group scx_global_attr_group = { .attrs = scx_global_attrs, }; +static void free_pnode(struct scx_sched_pnode *pnode); static void free_exit_info(struct scx_exit_info *ei); static void scx_sched_free_rcu_work(struct work_struct *work) @@ -3670,22 +4707,42 @@ static void scx_sched_free_rcu_work(struct work_struct *work) struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; - int node; + int cpu, node; - irq_work_sync(&sch->error_irq_work); + irq_work_sync(&sch->disable_irq_work); kthread_destroy_worker(sch->helper); + timer_shutdown_sync(&sch->bypass_lb_timer); + +#ifdef CONFIG_EXT_SUB_SCHED + 
kfree(sch->cgrp_path); + if (sch_cgroup(sch)) + cgroup_put(sch_cgroup(sch)); +#endif /* CONFIG_EXT_SUB_SCHED */ + + for_each_possible_cpu(cpu) { + struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); + + /* + * $sch would have entered bypass mode before the RCU grace + * period. As that blocks new deferrals, all + * deferred_reenq_local_node's must be off-list by now. + */ + WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node)); + + exit_dsq(bypass_dsq(sch, cpu)); + } free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) - kfree(sch->global_dsqs[node]); - kfree(sch->global_dsqs); + free_pnode(sch->pnode[node]); + kfree(sch->pnode); rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); do { rhashtable_walk_start(&rht_iter); - while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter)))) destroy_dsq(sch, dsq->id); rhashtable_walk_stop(&rht_iter); @@ -3702,13 +4759,15 @@ static void scx_kobj_release(struct kobject *kobj) struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); - queue_rcu_work(system_unbound_wq, &sch->rcu_work); + queue_rcu_work(system_dfl_wq, &sch->rcu_work); } static ssize_t scx_attr_ops_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return sysfs_emit(buf, "%s\n", scx_root->ops.name); + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + + return sysfs_emit(buf, "%s\n", sch->ops.name); } SCX_ATTR(ops); @@ -3729,10 +4788,14 @@ static ssize_t scx_attr_events_show(struct kobject *kobj, at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT); at += 
scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); + at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH); return at; } SCX_ATTR(events); @@ -3752,7 +4815,19 @@ static const struct kobj_type scx_ktype = { static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { - return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name); + const struct scx_sched *sch; + + /* + * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype) + * and sub-scheduler kset kobjects (kset_ktype) through the parent + * chain walk. Filter out the latter to avoid invalid casts. + */ + if (kobj->ktype != &scx_ktype) + return 0; + + sch = container_of(kobj, struct scx_sched, kobj); + + return add_uevent_var(env, "SCXOPS=%s", sch->ops.name); } static const struct kset_uevent_ops scx_uevent_ops = { @@ -3779,7 +4854,7 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) if (!scx_enabled()) return true; - sch = rcu_dereference_sched(scx_root); + sch = scx_task_sched(p); if (unlikely(!sch)) return true; @@ -3872,7 +4947,7 @@ void scx_softlockup(u32 dur_s) * a good state before taking more drastic actions. * * Returns %true if sched_ext is enabled and abort was initiated, which may - * resolve the reported hardlockdup. %false if sched_ext is not enabled or + * resolve the reported hardlockup. %false if sched_ext is not enabled or * someone else already initiated abort. 
*/ bool scx_hardlockup(int cpu) @@ -3885,13 +4960,14 @@ bool scx_hardlockup(int cpu) return true; } -static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, +static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor, struct cpumask *donee_mask, struct cpumask *resched_mask, u32 nr_donor_target, u32 nr_donee_target) { - struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + struct rq *donor_rq = cpu_rq(donor); + struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor); struct task_struct *p, *n; - struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0); s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; u32 nr_balanced = 0, min_delta_us; @@ -3901,11 +4977,11 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, * consider offloading iff the total queued duration is over the * threshold. */ - min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; - if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) + min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV; + if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us))) return 0; - raw_spin_rq_lock_irq(rq); + raw_spin_rq_lock_irq(donor_rq); raw_spin_lock(&donor_dsq->lock); list_add(&cursor.node, &donor_dsq->list); resume: @@ -3913,7 +4989,6 @@ resume: n = nldsq_next_task(donor_dsq, n, false); while ((p = n)) { - struct rq *donee_rq; struct scx_dispatch_q *donee_dsq; int donee; @@ -3929,14 +5004,13 @@ resume: if (donee >= nr_cpu_ids) continue; - donee_rq = cpu_rq(donee); - donee_dsq = &donee_rq->scx.bypass_dsq; + donee_dsq = bypass_dsq(sch, donee); /* * $p's rq is not locked but $p's DSQ lock protects its * scheduling properties making this test safe. */ - if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) + if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false)) continue; /* @@ -3951,7 +5025,7 @@ resume: * between bypass DSQs. 
*/ dispatch_dequeue_locked(p, donor_dsq); - dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); + dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED); /* * $donee might have been idle and need to be woken up. No need @@ -3966,9 +5040,9 @@ resume: if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { list_move_tail(&cursor.node, &n->scx.dsq_list.node); raw_spin_unlock(&donor_dsq->lock); - raw_spin_rq_unlock_irq(rq); + raw_spin_rq_unlock_irq(donor_rq); cpu_relax(); - raw_spin_rq_lock_irq(rq); + raw_spin_rq_lock_irq(donor_rq); raw_spin_lock(&donor_dsq->lock); goto resume; } @@ -3976,7 +5050,7 @@ resume: list_del_init(&cursor.node); raw_spin_unlock(&donor_dsq->lock); - raw_spin_rq_unlock_irq(rq); + raw_spin_rq_unlock_irq(donor_rq); return nr_balanced; } @@ -3994,7 +5068,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node) /* count the target tasks and CPUs */ for_each_cpu_and(cpu, cpu_online_mask, node_mask) { - u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); nr_tasks += nr; nr_cpus++; @@ -4016,24 +5090,21 @@ static void bypass_lb_node(struct scx_sched *sch, int node) cpumask_clear(donee_mask); for_each_cpu_and(cpu, cpu_online_mask, node_mask) { - if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) + if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target) cpumask_set_cpu(cpu, donee_mask); } /* iterate !donee CPUs and see if they should be offloaded */ cpumask_clear(resched_mask); for_each_cpu_and(cpu, cpu_online_mask, node_mask) { - struct rq *rq = cpu_rq(cpu); - struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; - if (cpumask_empty(donee_mask)) break; if (cpumask_test_cpu(cpu, donee_mask)) continue; - if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) + if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target) continue; - nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, + nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask, nr_donor_target, nr_target); } @@ 
-4041,7 +5112,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node) resched_cpu(cpu); for_each_cpu_and(cpu, cpu_online_mask, node_mask) { - u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); after_min = min(nr, after_min); after_max = max(nr, after_max); @@ -4063,12 +5134,11 @@ static void bypass_lb_node(struct scx_sched *sch, int node) */ static void scx_bypass_lb_timerfn(struct timer_list *timer) { - struct scx_sched *sch; + struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer); int node; u32 intv_us; - sch = rcu_dereference_all(scx_root); - if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth)) + if (!bypass_dsp_enabled(sch)) return; for_each_node_with_cpus(node) @@ -4079,10 +5149,102 @@ static void scx_bypass_lb_timerfn(struct timer_list *timer) mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); } -static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); +static bool inc_bypass_depth(struct scx_sched *sch) +{ + lockdep_assert_held(&scx_bypass_lock); + + WARN_ON_ONCE(sch->bypass_depth < 0); + WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1); + if (sch->bypass_depth != 1) + return false; + + WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); + sch->bypass_timestamp = ktime_get_ns(); + scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); + return true; +} + +static bool dec_bypass_depth(struct scx_sched *sch) +{ + lockdep_assert_held(&scx_bypass_lock); + + WARN_ON_ONCE(sch->bypass_depth < 1); + WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1); + if (sch->bypass_depth != 0) + return false; + + WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL); + scx_add_event(sch, SCX_EV_BYPASS_DURATION, + ktime_get_ns() - sch->bypass_timestamp); + return true; +} + +static void enable_bypass_dsp(struct scx_sched *sch) +{ + struct scx_sched *host = scx_parent(sch) ?: sch; + u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us); + s32 ret; + + /* + * @sch->bypass_depth 
transitioning from 0 to 1 triggers enabling. + * Shouldn't stagger. + */ + if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim))) + return; + + /* + * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of + * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is + * called iff @sch is not already bypassed due to an ancestor bypassing, + * we can assume that the parent is not bypassing and thus will be the + * host of the bypass DSQs. + * + * While the situation may change in the future, the following + * guarantees that the nearest non-bypassing ancestor or root has bypass + * dispatch enabled while a descendant is bypassing, which is all that's + * required. + * + * bypass_dsp_enabled() test is used to determine whether to enter the + * bypass dispatch handling path from both bypassing and hosting scheds. + * Bump enable depth on both @sch and bypass dispatch host. + */ + ret = atomic_inc_return(&sch->bypass_dsp_enable_depth); + WARN_ON_ONCE(ret <= 0); + + if (host != sch) { + ret = atomic_inc_return(&host->bypass_dsp_enable_depth); + WARN_ON_ONCE(ret <= 0); + } + + /* + * The LB timer will stop running if bypass dispatch is disabled. Start + * after enabling bypass dispatch. 
+ */ + if (intv_us && !timer_pending(&host->bypass_lb_timer)) + mod_timer(&host->bypass_lb_timer, + jiffies + usecs_to_jiffies(intv_us)); +} + +/* may be called without holding scx_bypass_lock */ +static void disable_bypass_dsp(struct scx_sched *sch) +{ + s32 ret; + + if (!test_and_clear_bit(0, &sch->bypass_dsp_claim)) + return; + + ret = atomic_dec_return(&sch->bypass_dsp_enable_depth); + WARN_ON_ONCE(ret < 0); + + if (scx_parent(sch)) { + ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth); + WARN_ON_ONCE(ret < 0); + } +} /** * scx_bypass - [Un]bypass scx_ops and guarantee forward progress + * @sch: sched to bypass * @bypass: true for bypass, false for unbypass * * Bypassing guarantees that all runnable tasks make forward progress without @@ -4112,49 +5274,42 @@ static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); * * - scx_prio_less() reverts to the default core_sched_at order. */ -static void scx_bypass(bool bypass) +static void scx_bypass(struct scx_sched *sch, bool bypass) { - static DEFINE_RAW_SPINLOCK(bypass_lock); - static unsigned long bypass_timestamp; - struct scx_sched *sch; + struct scx_sched *pos; unsigned long flags; int cpu; - raw_spin_lock_irqsave(&bypass_lock, flags); - sch = rcu_dereference_bh(scx_root); + raw_spin_lock_irqsave(&scx_bypass_lock, flags); if (bypass) { - u32 intv_us; - - WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1); - WARN_ON_ONCE(scx_bypass_depth <= 0); - if (scx_bypass_depth != 1) + if (!inc_bypass_depth(sch)) goto unlock; - WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC); - bypass_timestamp = ktime_get_ns(); - if (sch) - scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); - - intv_us = READ_ONCE(scx_bypass_lb_intv_us); - if (intv_us && !timer_pending(&scx_bypass_lb_timer)) { - scx_bypass_lb_timer.expires = - jiffies + usecs_to_jiffies(intv_us); - add_timer_global(&scx_bypass_lb_timer); - } + + enable_bypass_dsp(sch); } else { - WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1); - 
WARN_ON_ONCE(scx_bypass_depth < 0); - if (scx_bypass_depth != 0) + if (!dec_bypass_depth(sch)) goto unlock; - WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL); - if (sch) - scx_add_event(sch, SCX_EV_BYPASS_DURATION, - ktime_get_ns() - bypass_timestamp); } /* + * Bypass state is propagated to all descendants - an scx_sched bypasses + * if itself or any of its ancestors are in bypass mode. + */ + raw_spin_lock(&scx_sched_lock); + scx_for_each_descendant_pre(pos, sch) { + if (pos == sch) + continue; + if (bypass) + inc_bypass_depth(pos); + else + dec_bypass_depth(pos); + } + raw_spin_unlock(&scx_sched_lock); + + /* * No task property is changing. We just need to make sure all currently - * queued tasks are re-queued according to the new scx_rq_bypassing() + * queued tasks are re-queued according to the new scx_bypassing() * state. As an optimization, walk each rq's runnable_list instead of * the scx_tasks list. * @@ -4166,19 +5321,23 @@ static void scx_bypass(bool bypass) struct task_struct *p, *n; raw_spin_rq_lock(rq); + raw_spin_lock(&scx_sched_lock); - if (bypass) { - WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); - rq->scx.flags |= SCX_RQ_BYPASSING; - } else { - WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); - rq->scx.flags &= ~SCX_RQ_BYPASSING; + scx_for_each_descendant_pre(pos, sch) { + struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu); + + if (pos->bypass_depth) + pcpu->flags |= SCX_SCHED_PCPU_BYPASSING; + else + pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING; } + raw_spin_unlock(&scx_sched_lock); + /* * We need to guarantee that no tasks are on the BPF scheduler * while bypassing. Either we see enabled or the enable path - * sees scx_rq_bypassing() before moving tasks to SCX. + * sees scx_bypassing() before moving tasks to SCX. 
*/ if (!scx_enabled()) { raw_spin_rq_unlock(rq); @@ -4194,6 +5353,9 @@ static void scx_bypass(bool bypass) */ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, scx.runnable_node) { + if (!scx_is_descendant(scx_task_sched(p), sch)) + continue; + /* cycling deq/enq is enough, see the function comment */ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { /* nothing */ ; @@ -4207,8 +5369,11 @@ static void scx_bypass(bool bypass) raw_spin_rq_unlock(rq); } + /* disarming must come after moving all tasks out of the bypass DSQs */ + if (!bypass) + disable_bypass_dsp(sch); unlock: - raw_spin_unlock_irqrestore(&bypass_lock, flags); + raw_spin_unlock_irqrestore(&scx_bypass_lock, flags); } static void free_exit_info(struct scx_exit_info *ei) @@ -4250,6 +5415,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) return "unregistered from the main kernel"; case SCX_EXIT_SYSRQ: return "disabled by sysrq-S"; + case SCX_EXIT_PARENT: + return "parent exiting"; case SCX_EXIT_ERROR: return "runtime error"; case SCX_EXIT_ERROR_BPF: @@ -4275,28 +5442,279 @@ static void free_kick_syncs(void) } } -static void scx_disable_workfn(struct kthread_work *work) +static void refresh_watchdog(void) { - struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); - struct scx_exit_info *ei = sch->exit_info; + struct scx_sched *sch; + unsigned long intv = ULONG_MAX; + + /* take the shortest timeout and use its half for watchdog interval */ + rcu_read_lock(); + list_for_each_entry_rcu(sch, &scx_sched_all, all) + intv = max(min(intv, sch->watchdog_timeout / 2), 1); + rcu_read_unlock(); + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + WRITE_ONCE(scx_watchdog_interval, intv); + + if (intv < ULONG_MAX) + mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv); + else + cancel_delayed_work_sync(&scx_watchdog_work); +} + +static s32 scx_link_sched(struct scx_sched *sch) +{ + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { +#ifdef CONFIG_EXT_SUB_SCHED 
+ struct scx_sched *parent = scx_parent(sch); + s32 ret; + + if (parent) { + /* + * scx_claim_exit() propagates exit_kind transition to + * its sub-scheds while holding scx_sched_lock - either + * we can see the parent's non-NONE exit_kind or the + * parent can shoot us down. + */ + if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { + scx_error(sch, "parent disabled"); + return -ENOENT; + } + + ret = rhashtable_lookup_insert_fast(&scx_sched_hash, + &sch->hash_node, scx_sched_hash_params); + if (ret) { + scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret); + return ret; + } + + list_add_tail(&sch->sibling, &parent->children); + } +#endif /* CONFIG_EXT_SUB_SCHED */ + + list_add_tail_rcu(&sch->all, &scx_sched_all); + } + + refresh_watchdog(); + return 0; +} + +static void scx_unlink_sched(struct scx_sched *sch) +{ + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { +#ifdef CONFIG_EXT_SUB_SCHED + if (scx_parent(sch)) { + rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node, + scx_sched_hash_params); + list_del_init(&sch->sibling); + } +#endif /* CONFIG_EXT_SUB_SCHED */ + list_del_rcu(&sch->all); + } + + refresh_watchdog(); +} + +/* + * Called to disable future dumps and wait for in-progress one while disabling + * @sch. Once @sch becomes empty during disable, there's no point in dumping it. + * This prevents calling dump ops on a dead sch. + */ +static void scx_disable_dump(struct scx_sched *sch) +{ + guard(raw_spinlock_irqsave)(&scx_dump_lock); + sch->dump_disabled = true; +} + +#ifdef CONFIG_EXT_SUB_SCHED +static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); + +static void drain_descendants(struct scx_sched *sch) +{ + /* + * Child scheds that finished the critical part of disabling will take + * themselves off @sch->children. Wait for it to drain. As propagation + * is recursive, empty @sch->children means that all proper descendant + * scheds reached unlinking stage. 
+ */ + wait_event(scx_unlink_waitq, list_empty(&sch->children)); +} + +static void scx_fail_parent(struct scx_sched *sch, + struct task_struct *failed, s32 fail_code) +{ + struct scx_sched *parent = scx_parent(sch); struct scx_task_iter sti; struct task_struct *p; - int kind, cpu; - kind = atomic_read(&sch->exit_kind); - while (true) { - if (kind == SCX_EXIT_DONE) /* already disabled? */ - return; - WARN_ON_ONCE(kind == SCX_EXIT_NONE); - if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) + scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler", + fail_code, failed->comm, failed->pid); + + /* + * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into + * it. This may cause downstream failures on the BPF side but $parent is + * dying anyway. + */ + scx_bypass(parent, true); + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + if (scx_task_on_sched(parent, p)) + continue; + + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + scx_disable_and_exit_task(sch, p); + rcu_assign_pointer(p->scx.sched, parent); + } + } + scx_task_iter_stop(&sti); +} + +static void scx_sub_disable(struct scx_sched *sch) +{ + struct scx_sched *parent = scx_parent(sch); + struct scx_task_iter sti; + struct task_struct *p; + int ret; + + /* + * Guarantee forward progress and wait for descendants to be disabled. + * To limit disruptions, $parent is not bypassed. Tasks are fully + * prepped and then inserted back into $parent. + */ + scx_bypass(sch, true); + drain_descendants(sch); + + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. 
+ */ + mutex_lock(&scx_enable_mutex); + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + set_cgroup_sched(sch_cgroup(sch), parent); + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + struct rq *rq; + struct rq_flags rf; + + /* filter out duplicate visits */ + if (scx_task_on_sched(parent, p)) + continue; + + /* + * By the time control reaches here, all descendant schedulers + * should already have been disabled. + */ + WARN_ON_ONCE(!scx_task_on_sched(sch, p)); + + /* + * If $p is about to be freed, nothing prevents $sch from + * unloading before $p reaches sched_ext_free(). Disable and + * exit $p right away. + */ + if (!tryget_task_struct(p)) { + scx_disable_and_exit_task(sch, p); + continue; + } + + scx_task_iter_unlock(&sti); + + /* + * $p is READY or ENABLED on @sch. Initialize for $parent, + * disable and exit from @sch, and then switch over to $parent. + * + * If a task fails to initialize for $parent, the only available + * action is disabling $parent too. While this allows disabling + * of a child sched to cause the parent scheduler to fail, the + * failure can only originate from ops.init_task() of the + * parent. A child can't directly affect the parent through its + * own failures. + */ + ret = __scx_init_task(parent, p, false); + if (ret) { + scx_fail_parent(sch, p, ret); + put_task_struct(p); break; + } + + rq = task_rq_lock(p, &rf); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* + * $p is initialized for $parent and still attached to + * @sch. Disable and exit for @sch, switch over to + * $parent, override the state to READY to account for + * $p having already been initialized, and then enable. 
+ */ + scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT); + rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_state(p, SCX_TASK_READY); + scx_enable_task(parent, p); + } + task_rq_unlock(rq, p, &rf); + + put_task_struct(p); } - ei->kind = kind; - ei->reason = scx_exit_reason(ei->kind); + scx_task_iter_stop(&sti); + + scx_disable_dump(sch); - /* guarantee forward progress by bypassing scx_ops */ - scx_bypass(true); - WRITE_ONCE(scx_aborting, false); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + /* + * All tasks are moved off of @sch but there may still be on-going + * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use + * the expedited version as ancestors may be waiting in bypass mode. + * Also, tell the parent that there is no need to keep running bypass + * DSQs for us. + */ + synchronize_rcu_expedited(); + disable_bypass_dsp(sch); + + scx_unlink_sched(sch); + + mutex_unlock(&scx_enable_mutex); + + /* + * @sch is now unlinked from the parent's children list. Notify and call + * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called + * after unlinking and releasing all locks. See scx_claim_exit(). 
+ */ + wake_up_all(&scx_unlink_waitq); + + if (parent->ops.sub_detach && sch->sub_attached) { + struct scx_sub_detach_args sub_detach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + SCX_CALL_OP(parent, sub_detach, NULL, + &sub_detach_args); + } + + if (sch->ops.exit) + SCX_CALL_OP(sch, exit, NULL, sch->exit_info); + kobject_del(&sch->kobj); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static void drain_descendants(struct scx_sched *sch) { } +static void scx_sub_disable(struct scx_sched *sch) { } +#endif /* CONFIG_EXT_SUB_SCHED */ + +static void scx_root_disable(struct scx_sched *sch) +{ + struct scx_exit_info *ei = sch->exit_info; + struct scx_task_iter sti; + struct task_struct *p; + int cpu; + + /* guarantee forward progress and wait for descendants to be disabled */ + scx_bypass(sch, true); + drain_descendants(sch); switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING: @@ -4323,7 +5741,7 @@ static void scx_disable_workfn(struct kthread_work *work) /* * Shut down cgroup support before tasks so that the cgroup attach path - * doesn't race against scx_exit_task(). + * doesn't race against scx_disable_and_exit_task(). 
*/ scx_cgroup_lock(); scx_cgroup_exit(sch); @@ -4337,7 +5755,7 @@ static void scx_disable_workfn(struct kthread_work *work) scx_init_task_enabled = false; - scx_task_iter_start(&sti); + scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; @@ -4352,9 +5770,16 @@ static void scx_disable_workfn(struct kthread_work *work) p->sched_class = new_class; } - scx_exit_task(p); + scx_disable_and_exit_task(scx_task_sched(p), p); } scx_task_iter_stop(&sti); + + scx_disable_dump(sch); + + scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), NULL); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* @@ -4387,9 +5812,9 @@ static void scx_disable_workfn(struct kthread_work *work) } if (sch->ops.exit) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei); + SCX_CALL_OP(sch, exit, NULL, ei); - cancel_delayed_work_sync(&scx_watchdog_work); + scx_unlink_sched(sch); /* * scx_root clearing must be inside cpus_read_lock(). See @@ -4406,27 +5831,31 @@ static void scx_disable_workfn(struct kthread_work *work) */ kobject_del(&sch->kobj); - free_percpu(scx_dsp_ctx); - scx_dsp_ctx = NULL; - scx_dsp_max_batch = 0; free_kick_syncs(); - if (scx_bypassed_for_enable) { - scx_bypassed_for_enable = false; - scx_bypass(false); - } - mutex_unlock(&scx_enable_mutex); WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); done: - scx_bypass(false); + scx_bypass(sch, false); } +/* + * Claim the exit on @sch. The caller must ensure that the helper kthread work + * is kicked before the current task can be preempted. Once exit_kind is + * claimed, scx_error() can no longer trigger, so if the current task gets + * preempted and the BPF scheduler fails to schedule it back, the helper work + * will never be kicked and the whole system can wedge. 
+ */ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) { int none = SCX_EXIT_NONE; + lockdep_assert_preemption_disabled(); + + if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) + kind = SCX_EXIT_ERROR; + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) return false; @@ -4435,24 +5864,61 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) * flag to break potential live-lock scenarios, ensuring we can * successfully reach scx_bypass(). */ - WRITE_ONCE(scx_aborting, true); + WRITE_ONCE(sch->aborting, true); + + /* + * Propagate exits to descendants immediately. Each has a dedicated + * helper kthread and can run in parallel. While most of disabling is + * serialized, running them in separate threads allows parallelizing + * ops.exit(), which can take arbitrarily long prolonging bypass mode. + * + * To guarantee forward progress, this propagation must be in-line so + * that ->aborting is synchronously asserted for all sub-scheds. The + * propagation is also the interlocking point against sub-sched + * attachment. See scx_link_sched(). + * + * This doesn't cause recursions as propagation only takes place for + * non-propagation exits. 
+ */ + if (kind != SCX_EXIT_PARENT) { + scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { + struct scx_sched *pos; + scx_for_each_descendant_pre(pos, sch) + scx_disable(pos, SCX_EXIT_PARENT); + } + } + return true; } -static void scx_disable(enum scx_exit_kind kind) +static void scx_disable_workfn(struct kthread_work *work) { - struct scx_sched *sch; - - if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) - kind = SCX_EXIT_ERROR; + struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); + struct scx_exit_info *ei = sch->exit_info; + int kind; - rcu_read_lock(); - sch = rcu_dereference(scx_root); - if (sch) { - scx_claim_exit(sch, kind); - kthread_queue_work(sch->helper, &sch->disable_work); + kind = atomic_read(&sch->exit_kind); + while (true) { + if (kind == SCX_EXIT_DONE) /* already disabled? */ + return; + WARN_ON_ONCE(kind == SCX_EXIT_NONE); + if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) + break; } - rcu_read_unlock(); + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + if (scx_parent(sch)) + scx_sub_disable(sch); + else + scx_root_disable(sch); +} + +static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) +{ + guard(preempt)(); + if (scx_claim_exit(sch, kind)) + irq_work_queue(&sch->disable_irq_work); } static void dump_newline(struct seq_buf *s) @@ -4470,14 +5936,14 @@ static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 
#ifdef CONFIG_TRACEPOINTS if (trace_sched_ext_dump_enabled()) { - /* protected by scx_dump_state()::dump_lock */ + /* protected by scx_dump_lock */ static char line_buf[SCX_EXIT_MSG_LEN]; va_start(args, fmt); vscnprintf(line_buf, sizeof(line_buf), fmt, args); va_end(args); - trace_sched_ext_dump(line_buf); + trace_call__sched_ext_dump(line_buf); } #endif /* @s may be zero sized and seq_buf triggers WARN if so */ @@ -4566,25 +6032,38 @@ static void ops_dump_exit(void) scx_dump_data.cpu = -1; } -static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, +static void scx_dump_task(struct scx_sched *sch, + struct seq_buf *s, struct scx_dump_ctx *dctx, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; - struct scx_sched *sch = scx_root; + struct scx_sched *task_sch = scx_task_sched(p); + const char *own_marker; + char sch_id_buf[32]; char dsq_id_buf[19] = "(n/a)"; unsigned long ops_state = atomic_long_read(&p->scx.ops_state); unsigned int bt_len = 0; + own_marker = task_sch == sch ? 
"*" : ""; + + if (task_sch->level == 0) + scnprintf(sch_id_buf, sizeof(sch_id_buf), "root"); + else + scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu", + task_sch->level, task_sch->ops.sub_cgroup_id); + if (p->scx.dsq) scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", (unsigned long long)p->scx.dsq->id); dump_newline(s); - dump_line(s, " %c%c %s[%d] %+ldms", + dump_line(s, " %c%c %s[%d] %s%s %+ldms", marker, task_state_to_char(p), p->comm, p->pid, + own_marker, sch_id_buf, jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", - scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, + scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT, + p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", @@ -4596,7 +6075,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p); + SCX_CALL_OP(sch, dump_task, NULL, dctx, p); ops_dump_exit(); } @@ -4609,11 +6088,17 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, } } -static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) +/* + * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless + * of which scheduler they belong to. If false, only dump tasks owned by @sch. + * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped + * separately. For error dumps, @dump_all_tasks=true since only the failing + * scheduler is dumped. 
+ */ +static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, + size_t dump_len, bool dump_all_tasks) { - static DEFINE_SPINLOCK(dump_lock); static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; - struct scx_sched *sch = scx_root; struct scx_dump_ctx dctx = { .kind = ei->kind, .exit_code = ei->exit_code, @@ -4623,14 +6108,24 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) }; struct seq_buf s; struct scx_event_stats events; - unsigned long flags; char *buf; int cpu; - spin_lock_irqsave(&dump_lock, flags); + guard(raw_spinlock_irqsave)(&scx_dump_lock); + + if (sch->dump_disabled) + return; seq_buf_init(&s, ei->dump, dump_len); +#ifdef CONFIG_EXT_SUB_SCHED + if (sch->level == 0) + dump_line(&s, "%s: root", sch->ops.name); + else + dump_line(&s, "%s: sub%d-%llu %s", + sch->ops.name, sch->level, sch->ops.sub_cgroup_id, + sch->cgrp_path); +#endif if (ei->kind == SCX_EXIT_NONE) { dump_line(&s, "Debug dump triggered by %s", ei->reason); } else { @@ -4644,7 +6139,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (SCX_HAS_OP(sch, dump)) { ops_dump_init(&s, ""); - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx); + SCX_CALL_OP(sch, dump, NULL, &dctx); ops_dump_exit(); } @@ -4697,11 +6192,14 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (!cpumask_empty(rq->scx.cpus_to_wait)) dump_line(&ns, " cpus_to_wait : %*pb", cpumask_pr_args(rq->scx.cpus_to_wait)); + if (!cpumask_empty(rq->scx.cpus_to_sync)) + dump_line(&ns, " cpus_to_sync : %*pb", + cpumask_pr_args(rq->scx.cpus_to_sync)); used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL, + SCX_CALL_OP(sch, dump_cpu, NULL, &dctx, cpu, idle); ops_dump_exit(); } @@ -4723,11 +6221,13 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_set_overflow(&s); } - if (rq->curr->sched_class == &ext_sched_class) - 
scx_dump_task(&s, &dctx, rq->curr, '*'); + if (rq->curr->sched_class == &ext_sched_class && + (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) + scx_dump_task(sch, &s, &dctx, rq->curr, '*'); list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) - scx_dump_task(&s, &dctx, p, ' '); + if (dump_all_tasks || scx_task_on_sched(sch, p)) + scx_dump_task(sch, &s, &dctx, p, ' '); next: rq_unlock_irqrestore(rq, &rf); } @@ -4742,25 +6242,27 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_REENQ_IMMED); + scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT); scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); + scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED); + scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH); if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), trunc_marker, sizeof(trunc_marker)); - - spin_unlock_irqrestore(&dump_lock, flags); } -static void scx_error_irq_workfn(struct irq_work *irq_work) +static void scx_disable_irq_workfn(struct irq_work *irq_work) { - struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work); + struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work); struct scx_exit_info *ei = sch->exit_info; if (ei->kind >= SCX_EXIT_ERROR) - scx_dump_state(ei, sch->ops.exit_dump_len); + scx_dump_state(sch, ei, sch->ops.exit_dump_len, true); kthread_queue_work(sch->helper, &sch->disable_work); } @@ -4771,6 +6273,8 @@ static bool scx_vexit(struct scx_sched *sch, { struct scx_exit_info *ei = sch->exit_info; + guard(preempt)(); + if 
(!scx_claim_exit(sch, kind)) return false; @@ -4788,7 +6292,7 @@ static bool scx_vexit(struct scx_sched *sch, ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); - irq_work_queue(&sch->error_irq_work); + irq_work_queue(&sch->disable_irq_work); return true; } @@ -4819,14 +6323,47 @@ static int alloc_kick_syncs(void) return 0; } -static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) +static void free_pnode(struct scx_sched_pnode *pnode) +{ + if (!pnode) + return; + exit_dsq(&pnode->global_dsq); + kfree(pnode); +} + +static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) +{ + struct scx_sched_pnode *pnode; + + pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node); + if (!pnode) + return NULL; + + if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) { + kfree(pnode); + return NULL; + } + + return pnode; +} + +/* + * Allocate and initialize a new scx_sched. @cgrp's reference is always + * consumed whether the function succeeds or fails. + */ +static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, + struct cgroup *cgrp, + struct scx_sched *parent) { struct scx_sched *sch; - int node, ret; + s32 level = parent ? 
parent->level + 1 : 0; + s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; - sch = kzalloc_obj(*sch); - if (!sch) - return ERR_PTR(-ENOMEM); + sch = kzalloc_flex(*sch, ancestors, level + 1); + if (!sch) { + ret = -ENOMEM; + goto err_put_cgrp; + } sch->exit_info = alloc_exit_info(ops->exit_dump_len); if (!sch->exit_info) { @@ -4838,29 +6375,42 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) if (ret < 0) goto err_free_ei; - sch->global_dsqs = kzalloc_objs(sch->global_dsqs[0], nr_node_ids); - if (!sch->global_dsqs) { + sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids); + if (!sch->pnode) { ret = -ENOMEM; goto err_free_hash; } for_each_node_state(node, N_POSSIBLE) { - struct scx_dispatch_q *dsq; - - dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); - if (!dsq) { + sch->pnode[node] = alloc_pnode(sch, node); + if (!sch->pnode[node]) { ret = -ENOMEM; - goto err_free_gdsqs; + goto err_free_pnode; } - - init_dsq(dsq, SCX_DSQ_GLOBAL); - sch->global_dsqs[node] = dsq; } - sch->pcpu = alloc_percpu(struct scx_sched_pcpu); + sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; + sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu, + dsp_ctx.buf, sch->dsp_max_batch), + __alignof__(struct scx_sched_pcpu)); if (!sch->pcpu) { ret = -ENOMEM; - goto err_free_gdsqs; + goto err_free_pnode; + } + + for_each_possible_cpu(cpu) { + ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch); + if (ret) { + bypass_fail_cpu = cpu; + goto err_free_pcpu; + } + } + + for_each_possible_cpu(cpu) { + struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); + + pcpu->sch = sch; + INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node); } sch->helper = kthread_run_worker(0, "sched_ext_helper"); @@ -4871,33 +6421,98 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sched_set_fifo(sch->helper->task); + if (parent) + memcpy(sch->ancestors, parent->ancestors, + level * sizeof(parent->ancestors[0])); + 
sch->ancestors[level] = sch; + sch->level = level; + + if (ops->timeout_ms) + sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); + else + sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; + + sch->slice_dfl = SCX_SLICE_DFL; atomic_set(&sch->exit_kind, SCX_EXIT_NONE); - init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); + init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); + timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); sch->ops = *ops; - ops->priv = sch; + rcu_assign_pointer(ops->priv, sch); sch->kobj.kset = scx_kset; - ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); - if (ret < 0) + +#ifdef CONFIG_EXT_SUB_SCHED + char *buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto err_stop_helper; + } + cgroup_path(cgrp, buf, PATH_MAX); + sch->cgrp_path = kstrdup(buf, GFP_KERNEL); + kfree(buf); + if (!sch->cgrp_path) { + ret = -ENOMEM; goto err_stop_helper; + } + + sch->cgrp = cgrp; + INIT_LIST_HEAD(&sch->children); + INIT_LIST_HEAD(&sch->sibling); + + if (parent) + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, + &parent->sub_kset->kobj, + "sub-%llu", cgroup_id(cgrp)); + else + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + if (ret < 0) { + kobject_put(&sch->kobj); + return ERR_PTR(ret); + } + + if (ops->sub_attach) { + sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); + if (!sch->sub_kset) { + kobject_put(&sch->kobj); + return ERR_PTR(-ENOMEM); + } + } +#else /* CONFIG_EXT_SUB_SCHED */ + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + if (ret < 0) { + kobject_put(&sch->kobj); + return ERR_PTR(ret); + } +#endif /* CONFIG_EXT_SUB_SCHED */ return sch; +#ifdef CONFIG_EXT_SUB_SCHED err_stop_helper: kthread_destroy_worker(sch->helper); +#endif err_free_pcpu: + for_each_possible_cpu(cpu) { + if (cpu == bypass_fail_cpu) + break; + exit_dsq(bypass_dsq(sch, cpu)); + } 
free_percpu(sch->pcpu); -err_free_gdsqs: +err_free_pnode: for_each_node_state(node, N_POSSIBLE) - kfree(sch->global_dsqs[node]); - kfree(sch->global_dsqs); + free_pnode(sch->pnode[node]); + kfree(sch->pnode); err_free_hash: rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); err_free_ei: free_exit_info(sch->exit_info); err_free_sch: kfree(sch); +err_put_cgrp: +#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) + cgroup_put(cgrp); +#endif return ERR_PTR(ret); } @@ -4946,29 +6561,35 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) return -EINVAL; } - if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) - pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); - if (ops->cpu_acquire || ops->cpu_release) pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); return 0; } -static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +/* + * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid + * starvation. During the READY -> ENABLED task switching loop, the calling + * thread's sched_class gets switched from fair to ext. As fair has higher + * priority than ext, the calling thread can be indefinitely starved under + * fair-class saturation, leading to a system hang. 
+ */ +struct scx_enable_cmd { + struct kthread_work work; + struct sched_ext_ops *ops; + int ret; +}; + +static void scx_root_enable_workfn(struct kthread_work *work) { + struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); + struct sched_ext_ops *ops = cmd->ops; + struct cgroup *cgrp = root_cgroup(); struct scx_sched *sch; struct scx_task_iter sti; struct task_struct *p; - unsigned long timeout; int i, cpu, ret; - if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), - cpu_possible_mask)) { - pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); - return -EINVAL; - } - mutex_lock(&scx_enable_mutex); if (scx_enable_state() != SCX_DISABLED) { @@ -4980,7 +6601,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) goto err_unlock; - sch = scx_alloc_and_add_sched(ops); +#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) + cgroup_get(cgrp); +#endif + sch = scx_alloc_and_add_sched(ops, cgrp, NULL); if (IS_ERR(sch)) { ret = PTR_ERR(sch); goto err_free_ksyncs; @@ -4992,13 +6616,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); WARN_ON_ONCE(scx_root); - if (WARN_ON_ONCE(READ_ONCE(scx_aborting))) - WRITE_ONCE(scx_aborting, false); atomic_long_set(&scx_nr_rejected, 0); - for_each_possible_cpu(cpu) - cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + rq->scx.local_dsq.sched = sch; + rq->scx.cpuperf_target = SCX_CPUPERF_ONE; + } /* * Keep CPUs stable during enable so that the BPF scheduler can track @@ -5012,10 +6638,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) */ rcu_assign_pointer(scx_root, sch); + ret = scx_link_sched(sch); + if (ret) + goto err_disable; + scx_idle_enable(ops); if (sch->ops.init) { - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); + ret = SCX_CALL_OP_RET(sch, 
init, NULL); if (ret) { ret = ops_sanitize_err(sch, "init", ret); cpus_read_unlock(); @@ -5042,34 +6672,13 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (ret) goto err_disable; - WARN_ON_ONCE(scx_dsp_ctx); - scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; - scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, - scx_dsp_max_batch), - __alignof__(struct scx_dsp_ctx)); - if (!scx_dsp_ctx) { - ret = -ENOMEM; - goto err_disable; - } - - if (ops->timeout_ms) - timeout = msecs_to_jiffies(ops->timeout_ms); - else - timeout = SCX_WATCHDOG_MAX_TIMEOUT; - - WRITE_ONCE(scx_watchdog_timeout, timeout); - WRITE_ONCE(scx_watchdog_timestamp, jiffies); - queue_delayed_work(system_unbound_wq, &scx_watchdog_work, - scx_watchdog_timeout / 2); - /* * Once __scx_enabled is set, %current can be switched to SCX anytime. * This can lead to stalls as some BPF schedulers (e.g. userspace * scheduling) may not function correctly before all tasks are switched. * Init in bypass mode to guarantee forward progress. */ - scx_bypass(true); - scx_bypassed_for_enable = true; + scx_bypass(sch, true); for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) @@ -5101,11 +6710,12 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) * never sees uninitialized tasks. 
*/ scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), sch); ret = scx_cgroup_init(sch); if (ret) goto err_disable_unlock_all; - scx_task_iter_start(&sti); + scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and @@ -5117,7 +6727,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_unlock(&sti); - ret = scx_init_task(p, task_group(p), false); + ret = scx_init_task(sch, p, false); if (ret) { put_task_struct(p); scx_task_iter_stop(&sti); @@ -5126,6 +6736,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) goto err_disable_unlock_all; } + scx_set_task_sched(p, sch); scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); @@ -5147,7 +6758,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) * scx_tasks_lock. */ percpu_down_write(&scx_fork_rwsem); - scx_task_iter_start(&sti); + scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; @@ -5160,15 +6771,14 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) queue_flags |= DEQUEUE_CLASS; scoped_guard (sched_change, p, queue_flags) { - p->scx.slice = READ_ONCE(scx_slice_dfl); + p->scx.slice = READ_ONCE(sch->slice_dfl); p->sched_class = new_class; } } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); - scx_bypassed_for_enable = false; - scx_bypass(false); + scx_bypass(sch, false); if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); @@ -5185,13 +6795,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) atomic_long_inc(&scx_enable_seq); - return 0; + cmd->ret = 0; + return; err_free_ksyncs: free_kick_syncs(); err_unlock: mutex_unlock(&scx_enable_mutex); - return ret; + cmd->ret = ret; + 
return; err_disable_unlock_all: scx_cgroup_unlock(); @@ -5208,9 +6820,355 @@ err_disable: * Flush scx_disable_work to ensure that error is reported before init * completion. sch's base reference will be put by bpf_scx_unreg(). */ - scx_error(sch, "scx_enable() failed (%d)", ret); + scx_error(sch, "scx_root_enable() failed (%d)", ret); kthread_flush_work(&sch->disable_work); - return 0; + cmd->ret = 0; +} + +#ifdef CONFIG_EXT_SUB_SCHED +/* verify that a scheduler can be attached to @cgrp and return the parent */ +static struct scx_sched *find_parent_sched(struct cgroup *cgrp) +{ + struct scx_sched *parent = cgrp->scx_sched; + struct scx_sched *pos; + + lockdep_assert_held(&scx_sched_lock); + + /* can't attach twice to the same cgroup */ + if (parent->cgrp == cgrp) + return ERR_PTR(-EBUSY); + + /* does $parent allow sub-scheds? */ + if (!parent->ops.sub_attach) + return ERR_PTR(-EOPNOTSUPP); + + /* can't insert between $parent and its exiting children */ + list_for_each_entry(pos, &parent->children, sibling) + if (cgroup_is_descendant(pos->cgrp, cgrp)) + return ERR_PTR(-EBUSY); + + return parent; +} + +static bool assert_task_ready_or_enabled(struct task_struct *p) +{ + u32 state = scx_get_task_state(p); + + switch (state) { + case SCX_TASK_READY: + case SCX_TASK_ENABLED: + return true; + default: + WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", + state, p->comm, p->pid); + return false; + } +} + +static void scx_sub_enable_workfn(struct kthread_work *work) +{ + struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); + struct sched_ext_ops *ops = cmd->ops; + struct cgroup *cgrp; + struct scx_sched *parent, *sch; + struct scx_task_iter sti; + struct task_struct *p; + s32 i, ret; + + mutex_lock(&scx_enable_mutex); + + if (!scx_enabled()) { + ret = -ENODEV; + goto out_unlock; + } + + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + + 
raw_spin_lock_irq(&scx_sched_lock); + parent = find_parent_sched(cgrp); + if (IS_ERR(parent)) { + raw_spin_unlock_irq(&scx_sched_lock); + ret = PTR_ERR(parent); + goto out_put_cgrp; + } + kobject_get(&parent->kobj); + raw_spin_unlock_irq(&scx_sched_lock); + + /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ + sch = scx_alloc_and_add_sched(ops, cgrp, parent); + kobject_put(&parent->kobj); + if (IS_ERR(sch)) { + ret = PTR_ERR(sch); + goto out_unlock; + } + + ret = scx_link_sched(sch); + if (ret) + goto err_disable; + + if (sch->level >= SCX_SUB_MAX_DEPTH) { + scx_error(sch, "max nesting depth %d violated", + SCX_SUB_MAX_DEPTH); + goto err_disable; + } + + if (sch->ops.init) { + ret = SCX_CALL_OP_RET(sch, init, NULL); + if (ret) { + ret = ops_sanitize_err(sch, "init", ret); + scx_error(sch, "ops.init() failed (%d)", ret); + goto err_disable; + } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; + } + + if (validate_ops(sch, ops)) + goto err_disable; + + struct scx_sub_attach_args sub_attach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + + ret = SCX_CALL_OP_RET(parent, sub_attach, NULL, + &sub_attach_args); + if (ret) { + ret = ops_sanitize_err(sch, "sub_attach", ret); + scx_error(sch, "parent rejected (%d)", ret); + goto err_disable; + } + sch->sub_attached = true; + + scx_bypass(sch, true); + + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) + if (((void (**)(void))ops)[i]) + set_bit(i, sch->has_op); + + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + /* + * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see + * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. 
+ */ + set_cgroup_sched(sch_cgroup(sch), sch); + if (!(cgrp->self.flags & CSS_ONLINE)) { + scx_error(sch, "cgroup is not online"); + goto err_unlock_and_disable; + } + + /* + * Initialize tasks for the new child $sch without exiting them for + * $parent so that the tasks can always be reverted back to $parent + * sched on child init failure. + */ + WARN_ON_ONCE(scx_enabling_sub_sched); + scx_enabling_sub_sched = sch; + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + struct rq *rq; + struct rq_flags rf; + + /* + * Task iteration may visit the same task twice when racing + * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which + * finished __scx_init_task() and skip if set. + * + * A task may exit and get freed between __scx_init_task() + * completion and scx_enable_task(). In such cases, + * scx_disable_and_exit_task() must exit the task for both the + * parent and child scheds. + */ + if (p->scx.flags & SCX_TASK_SUB_INIT) + continue; + + /* see scx_root_enable() */ + if (!tryget_task_struct(p)) + continue; + + if (!assert_task_ready_or_enabled(p)) { + ret = -EINVAL; + goto abort; + } + + scx_task_iter_unlock(&sti); + + /* + * As $p is still on $parent, it can't be transitioned to INIT. + * Let's worry about task state later. Use __scx_init_task(). + */ + ret = __scx_init_task(sch, p, false); + if (ret) + goto abort; + + rq = task_rq_lock(p, &rf); + p->scx.flags |= SCX_TASK_SUB_INIT; + task_rq_unlock(rq, p, &rf); + + put_task_struct(p); + } + scx_task_iter_stop(&sti); + + /* + * All tasks are prepped. Disable/exit tasks for $parent and enable for + * the new @sch. + */ + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + /* + * Use clearing of %SCX_TASK_SUB_INIT to detect and skip + * duplicate iterations. + */ + if (!(p->scx.flags & SCX_TASK_SUB_INIT)) + continue; + + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* + * $p must be either READY or ENABLED. 
If ENABLED, + * __scx_disabled_and_exit_task() first disables and + * makes it READY. However, after exiting $p, it will + * leave $p as READY. + */ + assert_task_ready_or_enabled(p); + __scx_disable_and_exit_task(parent, p); + + /* + * $p is now only initialized for @sch and READY, which + * is what we want. Assign it to @sch and enable. + */ + rcu_assign_pointer(p->scx.sched, sch); + scx_enable_task(sch, p); + + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + } + scx_task_iter_stop(&sti); + + scx_enabling_sub_sched = NULL; + + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + scx_bypass(sch, false); + + pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); + kobject_uevent(&sch->kobj, KOBJ_ADD); + ret = 0; + goto out_unlock; + +out_put_cgrp: + cgroup_put(cgrp); +out_unlock: + mutex_unlock(&scx_enable_mutex); + cmd->ret = ret; + return; + +abort: + put_task_struct(p); + scx_task_iter_stop(&sti); + scx_enabling_sub_sched = NULL; + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + if (p->scx.flags & SCX_TASK_SUB_INIT) { + __scx_disable_and_exit_task(sch, p); + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + } + scx_task_iter_stop(&sti); +err_unlock_and_disable: + /* we'll soon enter disable path, keep bypass on */ + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +err_disable: + mutex_unlock(&scx_enable_mutex); + kthread_flush_work(&sch->disable_work); + cmd->ret = 0; +} + +static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct cgroup *cgrp = data; + struct cgroup *parent = cgroup_parent(cgrp); + + if (!cgroup_on_dfl(cgrp)) + return NOTIFY_OK; + + switch (action) { + case CGROUP_LIFETIME_ONLINE: + /* inherit ->scx_sched from $parent */ + if (parent) + rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); + break; + case CGROUP_LIFETIME_OFFLINE: + /* if there is a sched attached, shoot it down */ + if (cgrp->scx_sched && 
cgrp->scx_sched->cgrp == cgrp) + scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, + SCX_ECODE_RSN_CGROUP_OFFLINE, + "cgroup %llu going offline", cgroup_id(cgrp)); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block scx_cgroup_lifetime_nb = { + .notifier_call = scx_cgroup_lifetime_notify, +}; + +static s32 __init scx_cgroup_lifetime_notifier_init(void) +{ + return blocking_notifier_chain_register(&cgroup_lifetime_notifier, + &scx_cgroup_lifetime_nb); +} +core_initcall(scx_cgroup_lifetime_notifier_init); +#endif /* CONFIG_EXT_SUB_SCHED */ + +static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +{ + static struct kthread_worker *helper; + static DEFINE_MUTEX(helper_mutex); + struct scx_enable_cmd cmd; + + if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), + cpu_possible_mask)) { + pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); + return -EINVAL; + } + + if (!READ_ONCE(helper)) { + mutex_lock(&helper_mutex); + if (!helper) { + struct kthread_worker *w = + kthread_run_worker(0, "scx_enable_helper"); + if (IS_ERR_OR_NULL(w)) { + mutex_unlock(&helper_mutex); + return -ENOMEM; + } + sched_set_fifo(w->task); + WRITE_ONCE(helper, w); + } + mutex_unlock(&helper_mutex); + } + +#ifdef CONFIG_EXT_SUB_SCHED + if (ops->sub_cgroup_id > 1) + kthread_init_work(&cmd.work, scx_sub_enable_workfn); + else +#endif /* CONFIG_EXT_SUB_SCHED */ + kthread_init_work(&cmd.work, scx_root_enable_workfn); + cmd.ops = ops; + + kthread_queue_work(READ_ONCE(helper), &cmd.work); + kthread_flush_work(&cmd.work); + return cmd.ret; } @@ -5246,12 +7204,17 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, t = btf_type_by_id(reg->btf, reg->btf_id); if (t == task_struct_type) { - if (off >= offsetof(struct task_struct, scx.slice) && - off + size <= offsetofend(struct task_struct, scx.slice)) - return SCALAR_VALUE; - if (off >= offsetof(struct task_struct, scx.dsq_vtime) && - off + size <= offsetofend(struct task_struct, 
scx.dsq_vtime)) + /* + * COMPAT: Will be removed in v6.23. + */ + if ((off >= offsetof(struct task_struct, scx.slice) && + off + size <= offsetofend(struct task_struct, scx.slice)) || + (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) { + pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()"); return SCALAR_VALUE; + } + if (off >= offsetof(struct task_struct, scx.disallow) && off + size <= offsetofend(struct task_struct, scx.disallow)) return SCALAR_VALUE; @@ -5307,11 +7270,30 @@ static int bpf_scx_init_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, hotplug_seq): ops->hotplug_seq = *(u64 *)(udata + moff); return 1; +#ifdef CONFIG_EXT_SUB_SCHED + case offsetof(struct sched_ext_ops, sub_cgroup_id): + ops->sub_cgroup_id = *(u64 *)(udata + moff); + return 1; +#endif /* CONFIG_EXT_SUB_SCHED */ } return 0; } +#ifdef CONFIG_EXT_SUB_SCHED +static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog) +{ + struct scx_sched *sch; + + guard(rcu)(); + sch = scx_prog_sched(prog->aux); + if (unlikely(!sch)) + return; + + scx_error(sch, "dispatch recursion detected"); +} +#endif /* CONFIG_EXT_SUB_SCHED */ + static int bpf_scx_check_member(const struct btf_type *t, const struct btf_member *member, const struct bpf_prog *prog) @@ -5329,12 +7311,30 @@ static int bpf_scx_check_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): + case offsetof(struct sched_ext_ops, sub_attach): + case offsetof(struct sched_ext_ops, sub_detach): break; default: if (prog->sleepable) return -EINVAL; } +#ifdef CONFIG_EXT_SUB_SCHED + /* + * Enable private stack for operations that can nest along the + * hierarchy. 
+ * + * XXX - Ideally, we should only do this for scheds that allow + * sub-scheds and sub-scheds themselves but I don't know how to access + * struct_ops from here. + */ + switch (moff) { + case offsetof(struct sched_ext_ops, dispatch): + prog->aux->priv_stack_requested = true; + prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch; + } +#endif /* CONFIG_EXT_SUB_SCHED */ + return 0; } @@ -5346,10 +7346,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link) static void bpf_scx_unreg(void *kdata, struct bpf_link *link) { struct sched_ext_ops *ops = kdata; - struct scx_sched *sch = ops->priv; + struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); - scx_disable(SCX_EXIT_UNREG); + scx_disable(sch, SCX_EXIT_UNREG); kthread_flush_work(&sch->disable_work); + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); } @@ -5406,7 +7407,9 @@ static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgro static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} -#endif +#endif /* CONFIG_EXT_GROUP_SCHED */ +static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } +static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} static s32 sched_ext_ops__init(void) { return -EINVAL; } @@ -5446,6 +7449,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif + .sub_attach = sched_ext_ops__sub_attach, + .sub_detach = sched_ext_ops__sub_detach, .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, .init = 
sched_ext_ops__init, @@ -5476,7 +7481,15 @@ static struct bpf_struct_ops bpf_sched_ext_ops = { static void sysrq_handle_sched_ext_reset(u8 key) { - scx_disable(SCX_EXIT_SYSRQ); + struct scx_sched *sch; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (likely(sch)) + scx_disable(sch, SCX_EXIT_SYSRQ); + else + pr_info("sched_ext: BPF schedulers not loaded\n"); + rcu_read_unlock(); } static const struct sysrq_key_op sysrq_sched_ext_reset_op = { @@ -5489,9 +7502,10 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { static void sysrq_handle_sched_ext_dump(u8 key) { struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; + struct scx_sched *sch; - if (scx_enabled()) - scx_dump_state(&ei, 0); + list_for_each_entry_rcu(sch, &scx_sched_all, all) + scx_dump_state(sch, &ei, 0, false); } static const struct sysrq_key_op sysrq_sched_ext_dump_op = { @@ -5545,11 +7559,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { if (cur_class == &ext_sched_class) { + cpumask_set_cpu(cpu, this_scx->cpus_to_sync); ksyncs[cpu] = rq->scx.kick_sync; should_wait = true; - } else { - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } resched_curr(rq); @@ -5586,10 +7600,9 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) unsigned long *ksyncs; s32 cpu; - if (unlikely(!ksyncs_pcpu)) { - pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs"); + /* can race with free_kick_syncs() during scheduler disable */ + if (unlikely(!ksyncs_pcpu)) return; - } ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; @@ -5604,27 +7617,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } - if (!should_wait) - return; - - for_each_cpu(cpu, this_scx->cpus_to_wait) { - unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; - - /* - * Busy-wait 
until the task running at the time of kicking is no - * longer running. This can be used to implement e.g. core - * scheduling. - * - * smp_cond_load_acquire() pairs with store_releases in - * pick_task_scx() and put_prev_task_scx(). The former breaks - * the wait if SCX's scheduling path is entered even if the same - * task is picked subsequently. The latter is necessary to break - * the wait when $cpu is taken by a higher sched class. - */ - if (cpu != cpu_of(this_rq)) - smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); - - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + /* + * Can't wait in hardirq — kick_sync can't advance, deadlocking if + * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). + */ + if (should_wait) { + raw_spin_rq_lock(this_rq); + this_scx->kick_sync_pending = true; + resched_curr(this_rq); + raw_spin_rq_unlock(this_rq); } } @@ -5642,14 +7643,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) */ void print_scx_info(const char *log_lvl, struct task_struct *p) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; enum scx_enable_state state = scx_enable_state(); const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; char runnable_at_buf[22] = "?"; struct sched_class *class; unsigned long runnable_at; - if (state == SCX_DISABLED) + guard(rcu)(); + + sch = scx_task_sched_rcu(p); + + if (!sch) return; /* @@ -5676,6 +7681,14 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (!sch) + return NOTIFY_OK; + /* * SCX schedulers often have userspace components which are sometimes * involved in critial scheduling paths. 
PM operations involve freezing @@ -5686,12 +7699,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void * case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: case PM_RESTORE_PREPARE: - scx_bypass(true); + scx_bypass(sch, true); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: case PM_POST_RESTORE: - scx_bypass(false); + scx_bypass(sch, false); break; } @@ -5720,8 +7733,9 @@ void __init init_sched_ext_class(void) struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); - init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); - init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); + /* local_dsq's sch will be set during scx_root_enable() */ + BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); + INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -5729,6 +7743,10 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); + raw_spin_lock_init(&rq->scx.deferred_reenq_lock); + INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); + INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); @@ -5739,18 +7757,36 @@ void __init init_sched_ext_class(void) register_sysrq_key('S', &sysrq_sched_ext_reset_op); register_sysrq_key('D', &sysrq_sched_ext_dump_op); INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); + +#ifdef CONFIG_EXT_SUB_SCHED + BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params)); +#endif /* CONFIG_EXT_SUB_SCHED */ } /******************************************************************************** * Helpers that can be called from the BPF scheduler. 
*/ -static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, - u64 enq_flags) +static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags) { - if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) - return false; + bool is_local = dsq_id == SCX_DSQ_LOCAL || + (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON; + if (*enq_flags & SCX_ENQ_IMMED) { + if (unlikely(!is_local)) { + scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); + return false; + } + } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) { + *enq_flags |= SCX_ENQ_IMMED; + } + + return true; +} + +static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 *enq_flags) +{ lockdep_assert_irqs_disabled(); if (unlikely(!p)) { @@ -5758,18 +7794,27 @@ static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p return false; } - if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { - scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); + if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) { + scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags); + return false; + } + + /* see SCX_EV_INSERT_NOT_OWNED definition */ + if (unlikely(!scx_task_on_sched(sch, p))) { + __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); return false; } + if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) + return false; + return true; } static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, u64 dsq_id, u64 enq_flags) { - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; struct task_struct *ddsp_task; ddsp_task = __this_cpu_read(direct_dispatch_task); @@ -5778,7 +7823,7 @@ static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, return; } - if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { + if (unlikely(dspc->cursor >= sch->dsp_max_batch)) { scx_error(sch, "dispatch buffer overflow"); 
return; } @@ -5799,6 +7844,7 @@ __bpf_kfunc_start_defs(); * @dsq_id: DSQ to insert into * @slice: duration @p can run for in nsecs, 0 to keep the current value * @enq_flags: SCX_ENQ_* + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to * call this function spuriously. Can be called from ops.enqueue(), @@ -5833,16 +7879,17 @@ __bpf_kfunc_start_defs(); * to check the return value. */ __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, - u64 slice, u64 enq_flags) + u64 slice, u64 enq_flags, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return false; - if (!scx_dsq_insert_preamble(sch, p, enq_flags)) + if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) return false; if (slice) @@ -5859,15 +7906,16 @@ __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. 
*/ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, - u64 slice, u64 enq_flags) + u64 slice, u64 enq_flags, + const struct bpf_prog_aux *aux) { - scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags); + scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux); } static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) { - if (!scx_dsq_insert_preamble(sch, p, enq_flags)) + if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) return false; if (slice) @@ -5898,6 +7946,7 @@ struct scx_bpf_dsq_insert_vtime_args { * @args->slice: duration @p can run for in nsecs, 0 to keep the current value * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ * @args->enq_flags: SCX_ENQ_* + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided @@ -5922,13 +7971,14 @@ struct scx_bpf_dsq_insert_vtime_args { */ __bpf_kfunc bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, - struct scx_bpf_dsq_insert_vtime_args *args) + struct scx_bpf_dsq_insert_vtime_args *args, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return false; @@ -5950,44 +8000,61 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, if (unlikely(!sch)) return; +#ifdef CONFIG_EXT_SUB_SCHED + /* + * Disallow if any sub-scheds are attached. There is no way to tell + * which scheduler called us, just error out @p's scheduler. 
+ */ + if (unlikely(!list_empty(&sch->children))) { + scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used"); + return; + } +#endif + scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) -BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU) -BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_enqueue_dispatch, + .filter = scx_kfunc_context_filter, }; static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { - struct scx_sched *sch = scx_root; struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; + struct scx_sched *sch = src_dsq->sched; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; - if (!scx_kf_allowed_if_unlocked() && - !scx_kf_allowed(sch, SCX_KF_DISPATCH)) + if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) return false; /* * If the BPF scheduler keeps calling this function repeatedly, it can * cause similar live-lock conditions as consume_dispatch_q(). */ - if (unlikely(READ_ONCE(scx_aborting))) + if (unlikely(READ_ONCE(sch->aborting))) return false; + if (unlikely(!scx_task_on_sched(sch, p))) { + scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler", + p->comm, p->pid); + return false; + } + /* * Can be called from either ops.dispatch() locking this_rq() or any * context where no rq lock is held. 
If latter, lock @p's task_rq which @@ -6011,20 +8078,14 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); - /* - * Did someone else get to it? @p could have already left $src_dsq, got - * re-enqueud, or be in the process of being consumed by someone else. - */ - if (unlikely(p->scx.dsq != src_dsq || - u32_before(kit->cursor.priv, p->scx.dsq_seq) || - p->scx.holding_cpu >= 0) || - WARN_ON_ONCE(src_rq != task_rq(p))) { + /* did someone else get to it while we dropped the locks? */ + if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { raw_spin_unlock(&src_dsq->lock); goto out; } /* @p is still on $src_dsq and stable, determine the destination */ - dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p); + dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p)); /* * Apply vtime and slice updates before moving so that the new time is @@ -6058,44 +8119,42 @@ __bpf_kfunc_start_defs(); /** * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Can only be called from ops.dispatch(). */ -__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) +__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return 0; - if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) - return 0; - - return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); + return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor); } /** * scx_bpf_dispatch_cancel - Cancel the latest dispatch + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Cancel the latest dispatch. Can be called multiple times to cancel further * dispatches. Can only be called from ops.dispatch(). 
*/ -__bpf_kfunc void scx_bpf_dispatch_cancel(void) +__bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux) { - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_sched *sch; + struct scx_dsp_ctx *dspc; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return; - if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) - return; + dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; if (dspc->cursor > 0) dspc->cursor--; @@ -6105,10 +8164,21 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) /** * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ - * @dsq_id: DSQ to move task from + * @dsq_id: DSQ to move task from. Must be a user-created DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * @enq_flags: %SCX_ENQ_* * * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's - * local DSQ for execution. Can only be called from ops.dispatch(). + * local DSQ for execution with @enq_flags applied. Can only be called from + * ops.dispatch(). + * + * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as + * sources. Local DSQs support reenqueueing (a task can be picked up for + * execution, dequeued for property changes, or reenqueued), but the BPF + * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL + * is similar but also doesn't support reenqueueing, as it maps to multiple + * per-node DSQs making the scope difficult to define; this may change in the + * future. * * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() * before trying to move from the specified DSQ. It may also grab rq locks and @@ -6117,21 +8187,24 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) * Returns %true if a task has been moved, %false if there isn't any task to * move. 
*/ -__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) +__bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags, + const struct bpf_prog_aux *aux) { - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; struct scx_sched *sch; + struct scx_dsp_ctx *dspc; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return false; - if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) + if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags)) return false; + dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; + flush_dispatch_buf(sch, dspc->rq); dsq = find_user_dsq(sch, dsq_id); @@ -6140,7 +8213,7 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) return false; } - if (consume_dispatch_q(sch, dspc->rq, dsq)) { + if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) { /* * A successfully consumed task can be dequeued before it starts * running while the CPU is trying to migrate other dispatched @@ -6154,6 +8227,14 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) } } +/* + * COMPAT: ___v2 was introduced in v7.1. Remove this and ___v2 tag in the future. + */ +__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux) +{ + return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux); +} + /** * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6249,105 +8330,104 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler + * @cgroup_id: cgroup ID of the child scheduler to dispatch + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Allows a parent scheduler to trigger dispatching on one of its direct + * child schedulers. 
The child scheduler runs its dispatch operation to + * move tasks from dispatch queues to the local runqueue. + * + * Returns: true on success, false if cgroup_id is invalid, not a direct + * child, or caller lacks dispatch permission. + */ +__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux) +{ + struct rq *this_rq = this_rq(); + struct scx_sched *parent, *child; + + guard(rcu)(); + parent = scx_prog_sched(aux); + if (unlikely(!parent)) + return false; + + child = scx_find_sub_sched(cgroup_id); + + if (unlikely(!child)) + return false; + + if (unlikely(scx_parent(child) != parent)) { + scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu", + cgroup_id); + return false; + } + + return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev, + true); +} +#endif /* CONFIG_EXT_SUB_SCHED */ + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) -BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) -BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS) +/* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) +#ifdef CONFIG_EXT_SUB_SCHED +BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS) +#endif BTF_KFUNCS_END(scx_kfunc_ids_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_dispatch, + .filter = scx_kfunc_context_filter, }; -static u32 reenq_local(struct rq *rq) -{ - LIST_HEAD(tasks); - u32 
nr_enqueued = 0; - struct task_struct *p, *n; - - lockdep_assert_rq_held(rq); - - /* - * The BPF scheduler may choose to dispatch tasks back to - * @rq->scx.local_dsq. Move all candidate tasks off to a private list - * first to avoid processing the same tasks repeatedly. - */ - list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, - scx.dsq_list.node) { - /* - * If @p is being migrated, @p's current CPU may not agree with - * its allowed CPUs and the migration_cpu_stop is about to - * deactivate and re-activate @p anyway. Skip re-enqueueing. - * - * While racing sched property changes may also dequeue and - * re-enqueue a migrating task while its current CPU and allowed - * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to - * the current local DSQ for running tasks and thus are not - * visible to the BPF scheduler. - */ - if (p->migration_pending) - continue; - - dispatch_dequeue(rq, p); - list_add_tail(&p->scx.dsq_list.node, &tasks); - } - - list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { - list_del_init(&p->scx.dsq_list.node); - do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); - nr_enqueued++; - } - - return nr_enqueued; -} - __bpf_kfunc_start_defs(); /** * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Iterate over all of the tasks currently enqueued on the local DSQ of the * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of * processed tasks. Can only be called from ops.cpu_release(). - * - * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void - * returning variant that can be called from anywhere. 
*/ -__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +__bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct rq *rq; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return 0; - if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) - return 0; - rq = cpu_rq(smp_processor_id()); lockdep_assert_rq_held(rq); - return reenq_local(rq); + return reenq_local(sch, rq, SCX_REENQ_ANY); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) -BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS) BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_cpu_release, + .filter = scx_kfunc_context_filter, }; __bpf_kfunc_start_defs(); @@ -6356,11 +8436,12 @@ __bpf_kfunc_start_defs(); * scx_bpf_create_dsq - Create a custom DSQ * @dsq_id: DSQ to create * @node: NUMA node to allocate from + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. */ -__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) +__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux) { struct scx_dispatch_q *dsq; struct scx_sched *sch; @@ -6377,36 +8458,54 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) if (!dsq) return -ENOMEM; - init_dsq(dsq, dsq_id); + /* + * init_dsq() must be called in GFP_KERNEL context. Init it with NULL + * @sch and update afterwards. 
+ */ + ret = init_dsq(dsq, dsq_id, NULL); + if (ret) { + kfree(dsq); + return ret; + } rcu_read_lock(); - sch = rcu_dereference(scx_root); - if (sch) + sch = scx_prog_sched(aux); + if (sch) { + dsq->sched = sch; ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, dsq_hash_params); - else + } else { ret = -ENODEV; + } rcu_read_unlock(); - if (ret) + if (ret) { + exit_dsq(dsq); kfree(dsq); + } return ret; } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) -BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE) +/* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) +/* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */ +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_unlocked, + .filter = scx_kfunc_context_filter, }; __bpf_kfunc_start_defs(); @@ -6415,12 +8514,21 @@ __bpf_kfunc_start_defs(); * scx_bpf_task_set_slice - Set task's time slice * @p: task of interest * @slice: time slice to set in nsecs + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Set @p's time slice to @slice. Returns %true on success, %false if the * calling scheduler doesn't have authority over @p. 
*/ -__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice) +__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, + const struct bpf_prog_aux *aux) { + struct scx_sched *sch; + + guard(rcu)(); + sch = scx_prog_sched(aux); + if (unlikely(!scx_task_on_sched(sch, p))) + return false; + p->scx.slice = slice; return true; } @@ -6429,12 +8537,21 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice) * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering * @p: task of interest * @vtime: virtual time to set + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Set @p's virtual time to @vtime. Returns %true on success, %false if the * calling scheduler doesn't have authority over @p. */ -__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) +__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, + const struct bpf_prog_aux *aux) { + struct scx_sched *sch; + + guard(rcu)(); + sch = scx_prog_sched(aux); + if (unlikely(!scx_task_on_sched(sch, p))) + return false; + p->scx.dsq_vtime = vtime; return true; } @@ -6456,7 +8573,7 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) * lead to irq_work_queue() malfunction such as infinite busy wait for * IRQ status update. Suppress kicking. */ - if (scx_rq_bypassing(this_rq)) + if (scx_bypassing(sch, cpu_of(this_rq))) goto out; /* @@ -6496,18 +8613,19 @@ out: * scx_bpf_kick_cpu - Trigger reschedule on a CPU * @cpu: cpu to kick * @flags: %SCX_KICK_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or * trigger rescheduling on a busy CPU. This can be called from any online * scx_ops operation and the actual kicking is performed asynchronously through * an irq work. 
*/ -__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (likely(sch)) scx_kick_cpu(sch, cpu, flags); } @@ -6581,13 +8699,14 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) * @it: iterator to initialize * @dsq_id: DSQ to iterate * @flags: %SCX_DSQ_ITER_* + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Initialize BPF iterator @it which can be used with bpf_for_each() to walk * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes * tasks which are already queued when this function is invoked. */ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, - u64 flags) + u64 flags, const struct bpf_prog_aux *aux) { struct bpf_iter_scx_dsq_kern *kit = (void *)it; struct scx_sched *sch; @@ -6605,7 +8724,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, */ kit->dsq = NULL; - sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held()); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -6616,8 +8735,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (!kit->dsq) return -ENOENT; - kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, - READ_ONCE(kit->dsq->seq)); + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); return 0; } @@ -6631,41 +8749,13 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) { struct bpf_iter_scx_dsq_kern *kit = (void *)it; - bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; - struct task_struct *p; - unsigned long flags; if (!kit->dsq) return NULL; - raw_spin_lock_irqsave(&kit->dsq->lock, flags); - - if (list_empty(&kit->cursor.node)) - p = NULL; - else - p = 
container_of(&kit->cursor, struct task_struct, scx.dsq_list); + guard(raw_spinlock_irqsave)(&kit->dsq->lock); - /* - * Only tasks which were queued before the iteration started are - * visible. This bounds BPF iterations and guarantees that vtime never - * jumps in the other direction while iterating. - */ - do { - p = nldsq_next_task(kit->dsq, p, rev); - } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); - - if (p) { - if (rev) - list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); - else - list_move(&kit->cursor.node, &p->scx.dsq_list.node); - } else { - list_del_init(&kit->cursor.node); - } - - raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); - - return p; + return nldsq_cursor_next_task(&kit->cursor, kit->dsq); } /** @@ -6694,6 +8784,7 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) /** * scx_bpf_dsq_peek - Lockless peek at the first element. * @dsq_id: DSQ to examine. + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Read the first element in the DSQ. This is semantically equivalent to using * the DSQ iterator, but is lockfree. Of course, like any lockless operation, @@ -6702,12 +8793,13 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) * * Returns the pointer, or NULL indicates an empty queue OR internal error. 
*/ -__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) +__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct scx_dispatch_q *dsq; - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return NULL; @@ -6725,6 +8817,62 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) return rcu_dereference(dsq->first_task); } +/** + * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ + * @dsq_id: DSQ to re-enqueue + * @reenq_flags: %SCX_RENQ_* + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Iterate over all of the tasks currently enqueued on the DSQ identified by + * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are + * supported: + * + * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) + * - User DSQs + * + * Re-enqueues are performed asynchronously. Can be called from anywhere. + */ +__bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags, + const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + struct scx_dispatch_q *dsq; + + guard(preempt)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return; + + if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) { + scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags); + return; + } + + /* not specifying any filter bits is the same as %SCX_REENQ_ANY */ + if (!(reenq_flags & __SCX_REENQ_FILTER_MASK)) + reenq_flags |= SCX_REENQ_ANY; + + dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id()); + schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq()); +} + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from + * anywhere. 
+ * + * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the + * future. + */ +__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) +{ + scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux); +} + __bpf_kfunc_end_defs(); static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, @@ -6779,18 +8927,20 @@ __bpf_kfunc_start_defs(); * @fmt: error message format string * @data: format string parameters packaged using ___bpf_fill() macro * @data__sz: @data len, must end in '__sz' for the verifier + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops * disabling. */ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, - unsigned long long *data, u32 data__sz) + unsigned long long *data, u32 data__sz, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - sch = rcu_dereference_bh(scx_root); + sch = scx_prog_sched(aux); if (likely(sch) && bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); @@ -6802,18 +8952,19 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, * @fmt: error message format string * @data: format string parameters packaged using ___bpf_fill() macro * @data__sz: @data len, must end in '__sz' for the verifier + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Indicate that the BPF scheduler encountered a fatal error and initiate ops * disabling. 
*/ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, - u32 data__sz) + u32 data__sz, const struct bpf_prog_aux *aux) { struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - sch = rcu_dereference_bh(scx_root); + sch = scx_prog_sched(aux); if (likely(sch) && bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); @@ -6825,6 +8976,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, * @fmt: format string * @data: format string parameters packaged using ___bpf_fill() macro * @data__sz: @data len, must end in '__sz' for the verifier + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and * dump_task() to generate extra debug dump specific to the BPF scheduler. @@ -6833,7 +8985,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, * multiple calls. The last line is automatically terminated. */ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, - u32 data__sz) + u32 data__sz, const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct scx_dump_data *dd = &scx_dump_data; @@ -6842,7 +8994,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return; @@ -6879,38 +9031,21 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, } /** - * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ - * - * Iterate over all of the tasks currently enqueued on the local DSQ of the - * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from - * anywhere. 
- */ -__bpf_kfunc void scx_bpf_reenqueue_local___v2(void) -{ - struct rq *rq; - - guard(preempt)(); - - rq = this_rq(); - local_set(&rq->scx.reenq_local_deferred, 1); - schedule_deferred(rq); -} - -/** * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU * @cpu: CPU of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Return the maximum relative capacity of @cpu in relation to the most * performant CPU in the system. The return value is in the range [1, * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). */ -__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) +__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_cpu_capacity(cpu); else @@ -6920,6 +9055,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) /** * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU * @cpu: CPU of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Return the current relative performance of @cpu in relation to its maximum. * The return value is in the range [1, %SCX_CPUPERF_ONE]. @@ -6931,13 +9067,13 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) * * The result is in the range [1, %SCX_CPUPERF_ONE]. 
*/ -__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) +__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_freq_capacity(cpu); else @@ -6948,6 +9084,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) * scx_bpf_cpuperf_set - Set the relative performance target of a CPU * @cpu: CPU of interest * @perf: target performance level [0, %SCX_CPUPERF_ONE] + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Set the target performance level of @cpu to @perf. @perf is in linear * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the @@ -6958,13 +9095,13 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) * use. Consult hardware and cpufreq documentation for more information. The * current performance level can be monitored using scx_bpf_cpuperf_cur(). */ -__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) +__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return; @@ -7074,14 +9211,15 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) /** * scx_bpf_cpu_rq - Fetch the rq of a CPU * @cpu: CPU of the rq + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs */ -__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) +__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return NULL; @@ -7100,18 +9238,19 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) /** * scx_bpf_locked_rq - Return the rq currently locked by SCX + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns the rq if a rq 
lock is currently held by SCX. * Otherwise emits an error and returns NULL. */ -__bpf_kfunc struct rq *scx_bpf_locked_rq(void) +__bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct rq *rq; guard(preempt)(); - sch = rcu_dereference_sched(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return NULL; @@ -7127,16 +9266,17 @@ __bpf_kfunc struct rq *scx_bpf_locked_rq(void) /** * scx_bpf_cpu_curr - Return remote CPU's curr task * @cpu: CPU of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Callers must hold RCU read lock (KF_RCU). */ -__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) +__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return NULL; @@ -7147,41 +9287,6 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) } /** - * scx_bpf_task_cgroup - Return the sched cgroup of a task - * @p: task of interest - * - * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with - * from the scheduler's POV. SCX operations should use this function to - * determine @p's current cgroup as, unlike following @p->cgroups, - * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all - * rq-locked operations. Can be called on the parameter tasks of rq-locked - * operations. The restriction guarantees that @p's rq is locked by the caller. 
- */ -#ifdef CONFIG_CGROUP_SCHED -__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) -{ - struct task_group *tg = p->sched_task_group; - struct cgroup *cgrp = &cgrp_dfl_root.cgrp; - struct scx_sched *sch; - - guard(rcu)(); - - sch = rcu_dereference(scx_root); - if (unlikely(!sch)) - goto out; - - if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) - goto out; - - cgrp = tg_cgrp(tg); - -out: - cgroup_get(cgrp); - return cgrp; -} -#endif - -/** * scx_bpf_now - Returns a high-performance monotonically non-decreasing * clock for the current CPU. The clock returned is in nanoseconds. * @@ -7257,10 +9362,14 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED); + scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT); scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); + scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED); + scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH); } } @@ -7294,25 +9403,62 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, memcpy(events, &e_sys, events__sz); } +#ifdef CONFIG_CGROUP_SCHED +/** + * scx_bpf_task_cgroup - Return the sched cgroup of a task + * @p: task of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with + * from the scheduler's POV. SCX operations should use this function to + * determine @p's current cgroup as, unlike following @p->cgroups, + * @p->sched_task_group is stable for the duration of the SCX op. 
See + * SCX_CALL_OP_TASK() for details. + */ +__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p, + const struct bpf_prog_aux *aux) +{ + struct task_group *tg = p->sched_task_group; + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + struct scx_sched *sch; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + goto out; + + if (!scx_kf_arg_task_ok(sch, p)) + goto out; + + cgrp = tg_cgrp(tg); + +out: + cgroup_get(cgrp); + return cgrp; +} +#endif /* CONFIG_CGROUP_SCHED */ + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) -BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU); -BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU); -BTF_ID_FLAGS(func, scx_bpf_kick_cpu) +BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); +BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); +BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) -BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, scx_bpf_exit_bstr) -BTF_ID_FLAGS(func, scx_bpf_error_bstr) -BTF_ID_FLAGS(func, scx_bpf_dump_bstr) -BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) -BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) -BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) -BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS) 
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) @@ -7320,14 +9466,14 @@ BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_cpu_rq) -BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) -BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) -#ifdef CONFIG_CGROUP_SCHED -BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) -#endif +BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_now) BTF_ID_FLAGS(func, scx_bpf_events) +#ifdef CONFIG_CGROUP_SCHED +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE) +#endif BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { @@ -7335,6 +9481,115 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = { .set = &scx_kfunc_ids_any, }; +/* + * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc + * group; an op may permit zero or more groups, with the union expressed in + * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter()) + * consults this table to decide whether a context-sensitive kfunc is callable + * from a given SCX op. 
+ */ +enum scx_kf_allow_flags { + SCX_KF_ALLOW_UNLOCKED = 1 << 0, + SCX_KF_ALLOW_CPU_RELEASE = 1 << 1, + SCX_KF_ALLOW_DISPATCH = 1 << 2, + SCX_KF_ALLOW_ENQUEUE = 1 << 3, + SCX_KF_ALLOW_SELECT_CPU = 1 << 4, +}; + +/* + * Map each SCX op to the union of kfunc groups it permits, indexed by + * SCX_OP_IDX(op). Ops not listed only permit kfuncs that are not + * context-sensitive. + */ +static const u32 scx_kf_allow_flags[] = { + [SCX_OP_IDX(select_cpu)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, + [SCX_OP_IDX(enqueue)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, + [SCX_OP_IDX(dispatch)] = SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH, + [SCX_OP_IDX(cpu_release)] = SCX_KF_ALLOW_CPU_RELEASE, + [SCX_OP_IDX(init_task)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(dump)] = SCX_KF_ALLOW_UNLOCKED, +#ifdef CONFIG_EXT_GROUP_SCHED + [SCX_OP_IDX(cgroup_init)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_exit)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_prep_move)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_cancel_move)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_set_weight)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_set_bandwidth)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_set_idle)] = SCX_KF_ALLOW_UNLOCKED, +#endif /* CONFIG_EXT_GROUP_SCHED */ + [SCX_OP_IDX(sub_attach)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED, +}; + +/* + * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the + * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this + * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or + * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the + * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g. 
+ * scx_kfunc_ids_any) by falling through to "allow" when none of the + * context-sensitive sets contain the kfunc. + */ +int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id); + bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id); + bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); + bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); + bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); + u32 moff, flags; + + /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */ + if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release)) + return 0; + + /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ + if (prog->type == BPF_PROG_TYPE_SYSCALL) + return (in_unlocked || in_select_cpu) ? 0 : -EACCES; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + return -EACCES; + + /* + * add_subprog_and_kfunc() collects all kfunc calls, including dead code + * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets + * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set; + * do_check_main() re-runs the filter with st_ops set and enforces the + * actual restrictions. + */ + if (!prog->aux->st_ops) + return 0; + + /* + * Non-SCX struct_ops: only unlocked kfuncs are safe. The other + * context-sensitive kfuncs assume the rq lock is held by the SCX + * dispatch path, which doesn't apply to other struct_ops users. + */ + if (prog->aux->st_ops != &bpf_sched_ext_ops) + return in_unlocked ? 0 : -EACCES; + + /* SCX struct_ops: check the per-op allow list. 
*/ + moff = prog->aux->attach_st_ops_member_off; + flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; + + if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked) + return 0; + if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release) + return 0; + if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch) + return 0; + if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue) + return 0; + if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu) + return 0; + + return -EACCES; +} + static int __init scx_init(void) { int ret; @@ -7344,11 +9599,12 @@ static int __init scx_init(void) * register_btf_kfunc_id_set() needs most of the system to be up. * * Some kfuncs are context-sensitive and can only be called from - * specific SCX ops. They are grouped into BTF sets accordingly. - * Unfortunately, BPF currently doesn't have a way of enforcing such - * restrictions. Eventually, the verifier should be able to enforce - * them. For now, register them the same and make each kfunc explicitly - * check using scx_kf_allowed(). + * specific SCX ops. They are grouped into per-context BTF sets, each + * registered with scx_kfunc_context_filter as its .filter callback. The + * BPF core dedups identical filter pointers per hook + * (btf_populate_kfunc_set()), so the filter is invoked exactly once per + * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op + * restrictions at verify time. 
*/ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 43429b33e52c..0b7fc46aee08 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -11,7 +11,7 @@ void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); -int scx_fork(struct task_struct *p); +int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); bool scx_can_stop_tick(struct rq *rq); @@ -44,7 +44,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, static inline void scx_tick(struct rq *rq) {} static inline void scx_pre_fork(struct task_struct *p) {} -static inline int scx_fork(struct task_struct *p) { return 0; } +static inline int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index c5a3b0bac7c3..443d12a3df67 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -368,7 +368,7 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) /* * Enable NUMA optimization only when there are multiple NUMA domains - * among the online CPUs and the NUMA domains don't perfectly overlaps + * among the online CPUs and the NUMA domains don't perfectly overlap * with the LLC domains. * * If all CPUs belong to the same NUMA node and the same LLC domain, @@ -424,18 +424,24 @@ static inline bool task_affinity_all(const struct task_struct *p) * - prefer the last used CPU to take advantage of cached data (L1, L2) and * branch prediction optimizations. * - * 3. Pick a CPU within the same LLC (Last-Level Cache): + * 3. 
Prefer @prev_cpu's SMT sibling: + * - if @prev_cpu is busy and no fully idle core is available, try to + * place the task on an idle SMT sibling of @prev_cpu; keeping the + * task on the same core makes migration cheaper, preserves L1 cache + * locality and reduces wakeup latency. + * + * 4. Pick a CPU within the same LLC (Last-Level Cache): * - if the above conditions aren't met, pick a CPU that shares the same * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain * cache locality. * - * 4. Pick a CPU within the same NUMA node, if enabled: + * 5. Pick a CPU within the same NUMA node, if enabled: * - choose a CPU from the same NUMA node, if the node cpumask is a * subset of @cpus_allowed, to reduce memory access latency. * - * 5. Pick any idle CPU within the @cpus_allowed domain. + * 6. Pick any idle CPU within the @cpus_allowed domain. * - * Step 3 and 4 are performed only if the system has, respectively, + * Step 4 and 5 are performed only if the system has, respectively, * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs. * @@ -543,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * piled up on it even if there is an idle core elsewhere on * the system. */ - waker_node = cpu_to_node(cpu); + waker_node = scx_cpu_node_if_enabled(cpu); if (!(current->flags & PF_EXITING) && cpu_rq(cpu)->scx.local_dsq.nr == 0 && (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && @@ -616,6 +622,20 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } +#ifdef CONFIG_SCHED_SMT + /* + * Use @prev_cpu's sibling if it's idle. + */ + if (sched_smt_active()) { + for_each_cpu_and(cpu, cpu_smt_mask(prev_cpu), allowed) { + if (cpu == prev_cpu) + continue; + if (scx_idle_test_and_clear_cpu(cpu)) + goto out_unlock; + } + } +#endif + /* * Search for any idle CPU in the same LLC domain. 
*/ @@ -663,9 +683,8 @@ void scx_idle_init_masks(void) BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); - /* Allocate per-node idle cpumasks */ - scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, - num_possible_nodes()); + /* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */ + scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, nr_node_ids); BUG_ON(!scx_idle_node_masks); for_each_node(i) { @@ -768,8 +787,9 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) * either enqueue() sees the idle bit or update_idle() sees the task * that enqueue() queued. */ - if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); + if (SCX_HAS_OP(sch, update_idle) && do_notify && + !scx_bypassing(sch, cpu_of(rq))) + SCX_CALL_OP(sch, update_idle, rq, cpu_of(rq), idle); } static void reset_idle_masks(struct sched_ext_ops *ops) @@ -861,33 +881,40 @@ static bool check_builtin_idle_enabled(struct scx_sched *sch) * code. * * We can't simply check whether @p->migration_disabled is set in a - * sched_ext callback, because migration is always disabled for the current - * task while running BPF code. + * sched_ext callback, because the BPF prolog (__bpf_prog_enter) may disable + * migration for the current task while running BPF code. + * + * Since the BPF prolog calls migrate_disable() only when CONFIG_PREEMPT_RCU + * is enabled (via rcu_read_lock_dont_migrate()), migration_disabled == 1 for + * the current task is ambiguous only in that case: it could be from the BPF + * prolog rather than a real migrate_disable() call. * - * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively - * disable and re-enable migration. For this reason, the current task - * inside a sched_ext callback is always a migration-disabled task. 
+ * Without CONFIG_PREEMPT_RCU, the BPF prolog never calls migrate_disable(), + * so migration_disabled == 1 always means the task is truly + * migration-disabled. * - * Therefore, when @p->migration_disabled == 1, check whether @p is the - * current task or not: if it is, then migration was not disabled before - * entering the callback, otherwise migration was disabled. + * Therefore, when migration_disabled == 1 and CONFIG_PREEMPT_RCU is enabled, + * check whether @p is the current task or not: if it is, then migration was + * not disabled before entering the callback, otherwise migration was disabled. * * Returns true if @p is migration-disabled, false otherwise. */ static bool is_bpf_migration_disabled(const struct task_struct *p) { - if (p->migration_disabled == 1) - return p != current; - else - return p->migration_disabled; + if (p->migration_disabled == 1) { + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + return p != current; + return true; + } + return p->migration_disabled; } static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *allowed, u64 flags) { - struct rq *rq; - struct rq_flags rf; + unsigned long irq_flags; + bool we_locked = false; s32 cpu; if (!ops_cpu_valid(sch, prev_cpu, NULL)) @@ -897,27 +924,20 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, return -EBUSY; /* - * If called from an unlocked context, acquire the task's rq lock, - * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed. + * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq + * lock or @p's pi_lock. Three cases: * - * Otherwise, allow to use this kfunc only from ops.select_cpu() - * and ops.select_enqueue(). 
- */ - if (scx_kf_allowed_if_unlocked()) { - rq = task_rq_lock(p, &rf); - } else { - if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) - return -EPERM; - rq = scx_locked_rq(); - } - - /* - * Validate locking correctness to access p->cpus_ptr and - * p->nr_cpus_allowed: if we're holding an rq lock, we're safe; - * otherwise, assert that p->pi_lock is held. + * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock. + * - other rq-locked SCX op: scx_locked_rq() points at the held rq. + * - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops): + * nothing held, take pi_lock ourselves. */ - if (!rq) + if (this_rq()->scx.in_select_cpu) { lockdep_assert_held(&p->pi_lock); + } else if (!scx_locked_rq()) { + raw_spin_lock_irqsave(&p->pi_lock, irq_flags); + we_locked = true; + } /* * This may also be called from ops.enqueue(), so we need to handle @@ -936,8 +956,8 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, allowed ?: p->cpus_ptr, flags); } - if (scx_kf_allowed_if_unlocked()) - task_rq_unlock(rq, p, &rf); + if (we_locked) + raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); return cpu; } @@ -946,14 +966,15 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or * trigger an error if @cpu is invalid * @cpu: target CPU + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs */ -__bpf_kfunc int scx_bpf_cpu_node(s32 cpu) +__bpf_kfunc s32 scx_bpf_cpu_node(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) return NUMA_NO_NODE; return cpu_to_node(cpu); @@ -965,6 +986,7 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) * @prev_cpu: CPU @p was on previously * @wake_flags: %SCX_WAKE_* flags * @is_idle: out parameter indicating whether the returned CPU is idle + * 
@aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked * context such as a BPF test_run() call, as long as built-in CPU selection @@ -975,14 +997,15 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) * currently idle and thus a good candidate for direct dispatching. */ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *is_idle) + u64 wake_flags, bool *is_idle, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; s32 cpu; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1010,6 +1033,7 @@ struct scx_bpf_select_cpu_and_args { * @args->prev_cpu: CPU @p was on previously * @args->wake_flags: %SCX_WAKE_* flags * @args->flags: %SCX_PICK_IDLE* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided @@ -1028,13 +1052,14 @@ struct scx_bpf_select_cpu_and_args { */ __bpf_kfunc s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, - struct scx_bpf_select_cpu_and_args *args) + struct scx_bpf_select_cpu_and_args *args, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1056,6 +1081,17 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 if (unlikely(!sch)) return -ENODEV; +#ifdef CONFIG_EXT_SUB_SCHED + /* + * Disallow if any sub-scheds are attached. There is no way to tell + * which scheduler called us, just error out @p's scheduler. 
+ */ + if (unlikely(!list_empty(&sch->children))) { + scx_error(scx_task_sched(p), "__scx_bpf_select_cpu_and() must be used"); + return -EINVAL; + } +#endif + return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, cpus_allowed, flags); } @@ -1064,18 +1100,20 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the * idle-tracking per-CPU cpumask of a target NUMA node. * @node: target NUMA node + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns an empty cpumask if idle tracking is not enabled, if @node is * not valid, or running on a UP kernel. In this case the actual error will * be reported to the BPF scheduler via scx_error(). */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) +__bpf_kfunc const struct cpumask * +scx_bpf_get_idle_cpumask_node(s32 node, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return cpu_none_mask; @@ -1089,17 +1127,18 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) /** * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking * per-CPU cpumask. + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns an empty mask if idle tracking is not enabled, or running on a * UP kernel. */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return cpu_none_mask; @@ -1119,18 +1158,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be * used to determine if an entire physical core is free. 
* @node: target NUMA node + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns an empty cpumask if idle tracking is not enabled, if @node is * not valid, or running on a UP kernel. In this case the actual error will * be reported to the BPF scheduler via scx_error(). */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) +__bpf_kfunc const struct cpumask * +scx_bpf_get_idle_smtmask_node(s32 node, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return cpu_none_mask; @@ -1148,17 +1189,18 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, * per-physical-core cpumask. Can be used to determine if an entire physical * core is free. + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns an empty mask if idle tracking is not enabled, or running on a * UP kernel. */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return cpu_none_mask; @@ -1194,6 +1236,7 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) /** * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state * @cpu: cpu to test and clear idle for + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns %true if @cpu was idle and its idle state was successfully cleared. * %false otherwise. @@ -1201,13 +1244,13 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) * Unavailable if ops.update_idle() is implemented and * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 
*/ -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return false; @@ -1225,6 +1268,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) * @cpus_allowed: Allowed cpumask * @node: target NUMA node * @flags: %SCX_PICK_IDLE_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. * @@ -1240,13 +1284,14 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. */ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, - int node, u64 flags) + s32 node, u64 flags, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1261,6 +1306,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu * @cpus_allowed: Allowed cpumask * @flags: %SCX_PICK_IDLE_CPU_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu * number on success. -%EBUSY if no matching cpu was found. @@ -1280,13 +1326,13 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, * scx_bpf_pick_idle_cpu_node() instead. 
*/ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, - u64 flags) + u64 flags, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1307,6 +1353,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, * @cpus_allowed: Allowed cpumask * @node: target NUMA node * @flags: %SCX_PICK_IDLE_CPU_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu @@ -1323,14 +1370,15 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, * CPU. */ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, - int node, u64 flags) + s32 node, u64 flags, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; s32 cpu; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1356,6 +1404,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU * @cpus_allowed: Allowed cpumask * @flags: %SCX_PICK_IDLE_CPU_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu @@ -1370,14 +1419,14 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, * scx_bpf_pick_any_cpu_node() instead. 
*/ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, - u64 flags) + u64 flags, const struct bpf_prog_aux *aux) { struct scx_sched *sch; s32 cpu; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1402,20 +1451,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_idle) -BTF_ID_FLAGS(func, scx_bpf_cpu_node) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) -BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_idle) static const struct 
btf_kfunc_id_set scx_kfunc_set_idle = { @@ -1423,13 +1469,38 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = { .set = &scx_kfunc_ids_idle, }; +/* + * The select_cpu kfuncs internally call task_rq_lock() when invoked from an + * rq-unlocked context, and thus cannot be safely called from arbitrary tracing + * contexts where @p's pi_lock state is unknown. Keep them out of + * BPF_PROG_TYPE_TRACING by registering them in their own set which is exposed + * only to STRUCT_OPS and SYSCALL programs. + * + * These kfuncs are also members of scx_kfunc_ids_unlocked (see ext.c) because + * they're callable from unlocked contexts in addition to ops.select_cpu() and + * ops.enqueue(). + */ +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) + +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_select_cpu, + .filter = scx_kfunc_context_filter, +}; + int scx_idle_init(void) { int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu); return ret; } diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index fa583f141f35..dc35f850481e 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -12,6 +12,8 @@ struct sched_ext_ops; +extern struct btf_id_set8 scx_kfunc_ids_select_cpu; + void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); void 
scx_idle_init_masks(void); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 386c677e4c9a..62ce4eaf6a3f 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -6,6 +6,7 @@ * Copyright (c) 2025 Tejun Heo <tj@kernel.org> */ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) +#define SCX_MOFF_IDX(moff) ((moff) / sizeof(void (*)(void))) enum scx_consts { SCX_DSP_DFL_MAX_BATCH = 32, @@ -24,10 +25,16 @@ enum scx_consts { */ SCX_TASK_ITER_BATCH = 32, + SCX_BYPASS_HOST_NTH = 2, + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, SCX_BYPASS_LB_DONOR_PCT = 125, SCX_BYPASS_LB_MIN_DELTA_DIV = 4, SCX_BYPASS_LB_BATCH = 256, + + SCX_REENQ_LOCAL_MAX_REPEAT = 256, + + SCX_SUB_MAX_DEPTH = 4, }; enum scx_exit_kind { @@ -38,6 +45,7 @@ enum scx_exit_kind { SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + SCX_EXIT_PARENT, /* parent exiting */ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ @@ -62,6 +70,7 @@ enum scx_exit_kind { enum scx_exit_code { /* Reasons */ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + SCX_ECODE_RSN_CGROUP_OFFLINE = 2LLU << 32, /* Actions */ SCX_ECODE_ACT_RESTART = 1LLU << 48, @@ -74,7 +83,7 @@ enum scx_exit_flags { * info communication. The following flag indicates whether ops.init() * finished successfully. */ - SCX_EFLAG_INITIALIZED, + SCX_EFLAG_INITIALIZED = 1LLU << 0, }; /* @@ -175,9 +184,10 @@ enum scx_ops_flags { SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, /* - * CPU cgroup support flags + * If set, %SCX_ENQ_IMMED is assumed to be set on all local DSQ + * enqueues. 
*/ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ + SCX_OPS_ALWAYS_ENQ_IMMED = 1LLU << 7, SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | @@ -186,7 +196,7 @@ enum scx_ops_flags { SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_HAS_CGROUP_WEIGHT, + SCX_OPS_ALWAYS_ENQ_IMMED, /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, @@ -213,7 +223,7 @@ struct scx_exit_task_args { bool cancelled; }; -/* argument container for ops->cgroup_init() */ +/* argument container for ops.cgroup_init() */ struct scx_cgroup_init_args { /* the weight of the cgroup [1..10000] */ u32 weight; @@ -236,12 +246,12 @@ enum scx_cpu_preempt_reason { }; /* - * Argument container for ops->cpu_acquire(). Currently empty, but may be + * Argument container for ops.cpu_acquire(). Currently empty, but may be * expanded in the future. */ struct scx_cpu_acquire_args {}; -/* argument container for ops->cpu_release() */ +/* argument container for ops.cpu_release() */ struct scx_cpu_release_args { /* the reason the CPU was preempted */ enum scx_cpu_preempt_reason reason; @@ -250,9 +260,7 @@ struct scx_cpu_release_args { struct task_struct *task; }; -/* - * Informational context provided to dump operations. 
- */ +/* informational context provided to dump operations */ struct scx_dump_ctx { enum scx_exit_kind kind; s64 exit_code; @@ -261,6 +269,18 @@ struct scx_dump_ctx { u64 at_jiffies; }; +/* argument container for ops.sub_attach() */ +struct scx_sub_attach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + +/* argument container for ops.sub_detach() */ +struct scx_sub_detach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * @@ -721,6 +741,20 @@ struct sched_ext_ops { #endif /* CONFIG_EXT_GROUP_SCHED */ + /** + * @sub_attach: Attach a sub-scheduler + * @args: argument container, see the struct definition + * + * Return 0 to accept the sub-scheduler. -errno to reject. + */ + s32 (*sub_attach)(struct scx_sub_attach_args *args); + + /** + * @sub_detach: Detach a sub-scheduler + * @args: argument container, see the struct definition + */ + void (*sub_detach)(struct scx_sub_detach_args *args); + /* * All online ops must come before ops.cpu_online(). */ @@ -762,6 +796,10 @@ struct sched_ext_ops { */ void (*exit)(struct scx_exit_info *info); + /* + * Data fields must comes after all ops fields. + */ + /** * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch */ @@ -797,6 +835,12 @@ struct sched_ext_ops { u64 hotplug_seq; /** + * @cgroup_id: When >1, attach the scheduler as a sub-scheduler on the + * specified cgroup. + */ + u64 sub_cgroup_id; + + /** * @name: BPF scheduler's name * * Must be a non-zero valid BPF object name including only isalnum(), @@ -806,7 +850,7 @@ struct sched_ext_ops { char name[SCX_OPS_NAME_LEN]; /* internal use only, must be NULL */ - void *priv; + void __rcu *priv; }; enum scx_opi { @@ -854,6 +898,24 @@ struct scx_event_stats { s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; /* + * The number of times a task, enqueued on a local DSQ with + * SCX_ENQ_IMMED, was re-enqueued because the CPU was not available for + * immediate execution. 
+ */ + s64 SCX_EV_REENQ_IMMED; + + /* + * The number of times a reenq of local DSQ caused another reenq of + * local DSQ. This can happen when %SCX_ENQ_IMMED races against a higher + * priority class task even if the BPF scheduler always satisfies the + * prerequisites for %SCX_ENQ_IMMED at the time of enqueue. However, + * that scenario is very unlikely and this count going up regularly + * indicates that the BPF scheduler is handling %SCX_ENQ_REENQ + * incorrectly causing recursive reenqueues. + */ + s64 SCX_EV_REENQ_LOCAL_REPEAT; + + /* * Total number of times a task's time slice was refilled with the * default value (SCX_SLICE_DFL). */ @@ -873,15 +935,77 @@ struct scx_event_stats { * The number of times the bypassing mode has been activated. */ s64 SCX_EV_BYPASS_ACTIVATE; + + /* + * The number of times the scheduler attempted to insert a task that it + * doesn't own into a DSQ. Such attempts are ignored. + * + * As BPF schedulers are allowed to ignore dequeues, it's difficult to + * tell whether such an attempt is from a scheduler malfunction or an + * ignored dequeue around sub-sched enabling. If this count keeps going + * up regardless of sub-sched enabling, it likely indicates a bug in the + * scheduler. + */ + s64 SCX_EV_INSERT_NOT_OWNED; + + /* + * The number of times tasks from bypassing descendants are scheduled + * from sub_bypass_dsq's. 
+ */ + s64 SCX_EV_SUB_BYPASS_DISPATCH; +}; + +struct scx_sched; + +enum scx_sched_pcpu_flags { + SCX_SCHED_PCPU_BYPASSING = 1LLU << 0, +}; + +/* dispatch buf */ +struct scx_dsp_buf_ent { + struct task_struct *task; + unsigned long qseq; + u64 dsq_id; + u64 enq_flags; +}; + +struct scx_dsp_ctx { + struct rq *rq; + u32 cursor; + u32 nr_tasks; + struct scx_dsp_buf_ent buf[]; +}; + +struct scx_deferred_reenq_local { + struct list_head node; + u64 flags; + u64 seq; + u32 cnt; }; struct scx_sched_pcpu { + struct scx_sched *sch; + u64 flags; /* protected by rq lock */ + /* * The event counters are in a per-CPU variable to minimize the * accounting overhead. A system-wide view on the event counter is * constructed when requested by scx_bpf_events(). */ struct scx_event_stats event_stats; + + struct scx_deferred_reenq_local deferred_reenq_local; + struct scx_dispatch_q bypass_dsq; +#ifdef CONFIG_EXT_SUB_SCHED + u32 bypass_host_seq; +#endif + + /* must be the last entry - contains flex array */ + struct scx_dsp_ctx dsp_ctx; +}; + +struct scx_sched_pnode { + struct scx_dispatch_q global_dsq; }; struct scx_sched { @@ -897,15 +1021,50 @@ struct scx_sched { * per-node split isn't sufficient, it can be further split. */ struct rhashtable dsq_hash; - struct scx_dispatch_q **global_dsqs; + struct scx_sched_pnode **pnode; struct scx_sched_pcpu __percpu *pcpu; + u64 slice_dfl; + u64 bypass_timestamp; + s32 bypass_depth; + + /* bypass dispatch path enable state, see bypass_dsp_enabled() */ + unsigned long bypass_dsp_claim; + atomic_t bypass_dsp_enable_depth; + + bool aborting; + bool dump_disabled; /* protected by scx_dump_lock */ + u32 dsp_max_batch; + s32 level; + /* * Updates to the following warned bitfields can race causing RMW issues * but it doesn't really matter. 
*/ bool warned_zero_slice:1; bool warned_deprecated_rq:1; + bool warned_unassoc_progs:1; + + struct list_head all; + +#ifdef CONFIG_EXT_SUB_SCHED + struct rhash_head hash_node; + + struct list_head children; + struct list_head sibling; + struct cgroup *cgrp; + char *cgrp_path; + struct kset *sub_kset; + + bool sub_attached; +#endif /* CONFIG_EXT_SUB_SCHED */ + + /* + * The maximum amount of time in jiffies that a task may be runnable + * without being scheduled on a CPU. If this timeout is exceeded, it + * will trigger scx_error(). + */ + unsigned long watchdog_timeout; atomic_t exit_kind; struct scx_exit_info *exit_info; @@ -913,9 +1072,13 @@ struct scx_sched { struct kobject kobj; struct kthread_worker *helper; - struct irq_work error_irq_work; + struct irq_work disable_irq_work; struct kthread_work disable_work; + struct timer_list bypass_lb_timer; struct rcu_work rcu_work; + + /* all ancestors including self */ + struct scx_sched *ancestors[]; }; enum scx_wake_flags { @@ -942,13 +1105,27 @@ enum scx_enq_flags { SCX_ENQ_PREEMPT = 1LLU << 32, /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback. + * Only allowed on local DSQs. Guarantees that the task either gets + * on the CPU immediately and stays on it, or gets reenqueued back + * to the BPF scheduler. It will never linger on a local DSQ or be + * silently put back after preemption. + * + * The protection persists until the next fresh enqueue - it + * survives SAVE/RESTORE cycles, slice extensions and preemption. + * If the task can't stay on the CPU for any reason, it gets + * reenqueued back to the BPF scheduler. 
+ * + * Exiting and migration-disabled tasks bypass ops.enqueue() and + * are placed directly on a local DSQ without IMMED protection + * unless %SCX_OPS_ENQ_EXITING and %SCX_OPS_ENQ_MIGRATION_DISABLED + * are set respectively. + */ + SCX_ENQ_IMMED = 1LLU << 33, + + /* + * The task being enqueued was previously enqueued on a DSQ, but was + * removed and is being re-enqueued. See SCX_TASK_REENQ_* flags to find + * out why a given task is being reenqueued. */ SCX_ENQ_REENQ = 1LLU << 40, @@ -969,6 +1146,7 @@ enum scx_enq_flags { SCX_ENQ_CLEAR_OPSS = 1LLU << 56, SCX_ENQ_DSQ_PRIQ = 1LLU << 57, SCX_ENQ_NESTED = 1LLU << 58, + SCX_ENQ_GDSQ_FALLBACK = 1LLU << 59, /* fell back to global DSQ */ }; enum scx_deq_flags { @@ -982,6 +1160,28 @@ enum scx_deq_flags { * it hasn't been dispatched yet. Dequeue from the BPF side. */ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, + + /* + * The task is being dequeued due to a property change (e.g., + * sched_setaffinity(), sched_setscheduler(), set_user_nice(), + * etc.). + */ + SCX_DEQ_SCHED_CHANGE = 1LLU << 33, +}; + +enum scx_reenq_flags { + /* low 16bits determine which tasks should be reenqueued */ + SCX_REENQ_ANY = 1LLU << 0, /* all tasks */ + + __SCX_REENQ_FILTER_MASK = 0xffffLLU, + + __SCX_REENQ_USER_MASK = SCX_REENQ_ANY, + + /* bits 32-35 used by task_should_reenq() */ + SCX_REENQ_TSR_RQ_OPEN = 1LLU << 32, + SCX_REENQ_TSR_NOT_FIRST = 1LLU << 33, + + __SCX_REENQ_TSR_MASK = 0xfLLU << 32, }; enum scx_pick_idle_cpu_flags { @@ -1035,26 +1235,108 @@ static const char *scx_enable_state_str[] = { }; /* - * sched_ext_entity->ops_state + * Task Ownership State Machine (sched_ext_entity->ops_state) * - * Used to track the task ownership between the SCX core and the BPF scheduler. - * State transitions look as follows: + * The sched_ext core uses this state machine to track task ownership + * between the SCX core and the BPF scheduler. 
This allows the BPF + * scheduler to dispatch tasks without strict ordering requirements, while + * the SCX core safely rejects invalid dispatches. * - * NONE -> QUEUEING -> QUEUED -> DISPATCHING - * ^ | | - * | v v - * \-------------------------------/ + * State Transitions * - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call - * sites for explanations on the conditions being waited upon and why they are - * safe. Transitions out of them into NONE or QUEUED must store_release and the - * waiters should load_acquire. + * .------------> NONE (owned by SCX core) + * | | ^ + * | enqueue | | direct dispatch + * | v | + * | QUEUEING -------' + * | | + * | enqueue | + * | completes | + * | v + * | QUEUED (owned by BPF scheduler) + * | | + * | dispatch | + * | | + * | v + * | DISPATCHING + * | | + * | dispatch | + * | completes | + * `---------------' * - * Tracking scx_ops_state enables sched_ext core to reliably determine whether - * any given task can be dispatched by the BPF scheduler at all times and thus - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler - * to try to dispatch any task anytime regardless of its state as the SCX core - * can safely reject invalid dispatches. + * State Descriptions + * + * - %SCX_OPSS_NONE: + * Task is owned by the SCX core. It's either on a run queue, running, + * or being manipulated by the core scheduler. The BPF scheduler has no + * claim on this task. + * + * - %SCX_OPSS_QUEUEING: + * Transitional state while transferring a task from the SCX core to + * the BPF scheduler. The task's rq lock is held during this state. + * Since QUEUEING is both entered and exited under the rq lock, dequeue + * can never observe this state (it would be a BUG). When finishing a + * dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion + * path busy-waits for it to leave this state (via wait_ops_state()) + * before retrying. 
+ * + * - %SCX_OPSS_QUEUED: + * Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue) + * and the BPF scheduler is responsible for dispatching it. A QSEQ + * (queue sequence number) is embedded in this state to detect + * dispatch/dequeue races: if a task is dequeued and re-enqueued, the + * QSEQ changes and any in-flight dispatch operations targeting the old + * QSEQ are safely ignored. + * + * - %SCX_OPSS_DISPATCHING: + * Transitional state while transferring a task from the BPF scheduler + * back to the SCX core. This state indicates the BPF scheduler has + * selected the task for execution. When dequeue needs to take the task + * off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path + * busy-waits for it to leave this state (via wait_ops_state()) before + * proceeding. Exits to %SCX_OPSS_NONE when dispatch completes. + * + * Memory Ordering + * + * Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into + * %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release() + * and waiters must use atomic_long_read_acquire(). This ensures proper + * synchronization between concurrent operations. + * + * Cross-CPU Task Migration + * + * When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply + * grab the target CPU's rq lock because a concurrent dequeue might be + * waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock + * (deadlock). + * + * The sched_ext core uses a "lock dancing" protocol coordinated by + * p->scx.holding_cpu. When moving a task to a different rq: + * + * 1. Verify task can be moved (CPU affinity, migration_disabled, etc.) + * 2. Set p->scx.holding_cpu to the current CPU + * 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING + * is set, so clearing DISPATCHING first prevents the circular wait + * (safe to lock the rq we need) + * 4. Unlock the current CPU's rq + * 5. Lock src_rq (where the task currently lives) + * 6. 
Verify p->scx.holding_cpu == current CPU, if not, dequeue won the + * race (dequeue clears holding_cpu to -1 when it takes the task), in + * this case migration is aborted + * 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly + * into dst_rq's local DSQ (no lock swap needed) + * 8. Otherwise: call move_remote_task_to_local_dsq(), which releases + * src_rq, locks dst_rq, and performs the deactivate/activate + * migration cycle (dst_rq is held on return) + * 9. Unlock dst_rq and re-lock the current CPU's rq to restore + * the lock state expected by the caller + * + * If any verification fails, abort the migration. + * + * This state tracking allows the BPF scheduler to try to dispatch any task + * at any time regardless of its state. The SCX core can safely + * reject/ignore invalid dispatches, simplifying the BPF scheduler + * implementation. */ enum scx_ops_state { SCX_OPSS_NONE, /* owned by the SCX core */ @@ -1079,8 +1361,11 @@ enum scx_ops_state { #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) +extern struct scx_sched __rcu *scx_root; DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); +int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id); + /* * Return the rq currently locked from an scx callback, or NULL if no rq is * locked. @@ -1090,12 +1375,107 @@ static inline struct rq *scx_locked_rq(void) return __this_cpu_read(scx_locked_rq_state); } -static inline bool scx_kf_allowed_if_unlocked(void) +static inline bool scx_bypassing(struct scx_sched *sch, s32 cpu) +{ + return unlikely(per_cpu_ptr(sch->pcpu, cpu)->flags & + SCX_SCHED_PCPU_BYPASSING); +} + +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_task_sched - Find scx_sched scheduling a task + * @p: task of interest + * + * Return @p's scheduler instance. Must be called with @p's pi_lock or rq lock + * held. 
+ */ +static inline struct scx_sched *scx_task_sched(const struct task_struct *p) +{ + return rcu_dereference_protected(p->scx.sched, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(__rq_lockp(task_rq(p)))); +} + +/** + * scx_task_sched_rcu - Find scx_sched scheduling a task + * @p: task of interest + * + * Return @p's scheduler instance. The returned scx_sched is RCU protected. + */ +static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p) +{ + return rcu_dereference_all(p->scx.sched); +} + +/** + * scx_task_on_sched - Is a task on the specified sched? + * @sch: sched to test against + * @p: task of interest + * + * Returns %true if @p is on @sch, %false otherwise. + */ +static inline bool scx_task_on_sched(struct scx_sched *sch, + const struct task_struct *p) +{ + return rcu_access_pointer(p->scx.sched) == sch; +} + +/** + * scx_prog_sched - Find scx_sched associated with a BPF prog + * @aux: aux passed in from BPF to a kfunc + * + * To be called from kfuncs. Return the scheduler instance associated with the + * BPF program given the implicit kfunc argument aux. The returned scx_sched is + * RCU protected. + */ +static inline struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux) +{ + struct sched_ext_ops *ops; + struct scx_sched *root; + + ops = bpf_prog_get_assoc_struct_ops(aux); + if (likely(ops)) + return rcu_dereference_all(ops->priv); + + root = rcu_dereference_all(scx_root); + if (root) { + /* + * COMPAT-v6.19: Schedulers built before sub-sched support was + * introduced may have unassociated non-struct_ops programs. 
+ */ + if (!root->ops.sub_attach) + return root; + + if (!root->warned_unassoc_progs) { + printk_deferred(KERN_WARNING "sched_ext: Unassociated program %s (id %d)\n", + aux->name, aux->id); + root->warned_unassoc_progs = true; + } + } + + return NULL; +} +#else /* CONFIG_EXT_SUB_SCHED */ +static inline struct scx_sched *scx_task_sched(const struct task_struct *p) +{ + return rcu_dereference_protected(scx_root, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(__rq_lockp(task_rq(p)))); +} + +static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p) +{ + return rcu_dereference_all(scx_root); +} + +static inline bool scx_task_on_sched(struct scx_sched *sch, + const struct task_struct *p) { - return !current->scx.kf_mask; + return true; } -static inline bool scx_rq_bypassing(struct rq *rq) +static struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux) { - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); + return rcu_dereference_all(scx_root); } +#endif /* CONFIG_EXT_SUB_SCHED */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eea99ec01a3f..69361c63353a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -225,6 +225,7 @@ void __init sched_init_granularity(void) update_sysctl(); } +#ifndef CONFIG_64BIT #define WMULT_CONST (~0U) #define WMULT_SHIFT 32 @@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight return mul_u64_u32_shr(delta_exec, fact, shift); } +#else +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) +{ + return (delta_exec * weight) / lw->weight; +} +#endif /* * delta /= w @@ -589,6 +596,21 @@ static inline bool entity_before(const struct sched_entity *a, return vruntime_cmp(a->deadline, "<", b->deadline); } +/* + * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale + * and this value should be no more than two lag bounds. 
Which puts it in the + * general order of: + * + * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT + * + * which is around 44 bits in size (on 64bit); that is 20 for + * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for + * however many msec the actual slice+tick ends up begin. + * + * (disregarding the actual divide-by-weight part makes for the worst case + * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually + * being the zero-lag point). + */ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); @@ -650,25 +672,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * Since zero_vruntime closely tracks the per-task service, these * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag * induced in the system due to quantisation. - * - * Also, we use scale_load_down() to reduce the size. - * - * As measured, the max (key * weight) value was ~44 bits for a kernel build. 
*/ +static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w) +{ +#ifdef CONFIG_64BIT + if (cfs_rq->sum_shift) + w = max(2UL, w >> cfs_rq->sum_shift); +#endif + return w; +} + +static inline void +__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); + s64 w_vruntime, key = entity_key(cfs_rq, se); + + w_vruntime = key * weight; + WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62)); + + cfs_rq->sum_w_vruntime += w_vruntime; + cfs_rq->sum_weight += weight; +} + static void -sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long weight = scale_load_down(se->load.weight); - s64 key = entity_key(cfs_rq, se); + unsigned long weight; + s64 key, tmp; + +again: + weight = avg_vruntime_weight(cfs_rq, se->load.weight); + key = entity_key(cfs_rq, se); + + if (check_mul_overflow(key, weight, &key)) + goto overflow; - cfs_rq->sum_w_vruntime += key * weight; + if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp)) + goto overflow; + + cfs_rq->sum_w_vruntime = tmp; cfs_rq->sum_weight += weight; + return; + +overflow: + /* + * There's gotta be a limit -- if we're still failing at this point + * there's really nothing much to be done about things. 
+ */ + BUG_ON(cfs_rq->sum_shift >= 10); + cfs_rq->sum_shift++; + + /* + * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1 + */ + cfs_rq->sum_w_vruntime = 0; + cfs_rq->sum_weight = 0; + + for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost; + node; node = rb_next(node)) + __sum_w_vruntime_add(cfs_rq, __node_2_se(node)); + + goto again; +} + +static void +sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (sched_feat(PARANOID_AVG)) + return sum_w_vruntime_add_paranoid(cfs_rq, se); + + __sum_w_vruntime_add(cfs_rq, se); } static void sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long weight = scale_load_down(se->load.weight); + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); s64 key = entity_key(cfs_rq, se); cfs_rq->sum_w_vruntime -= key * weight; @@ -676,41 +756,65 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline -void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight + * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight */ cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; + cfs_rq->zero_vruntime += delta; } /* - * Specifically: avg_runtime() + 0 must result in entity_eligible() := true + * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true * For this to be so, the result of this function must have a left bias. + * + * Called in: + * - place_entity() -- before enqueue + * - update_entity_lag() -- before dequeue + * - update_deadline() -- slice expiration + * + * This means it is one entry 'behind' but that puts it close enough to where + * the bound on entity_key() is at most two lag bounds. 
*/ u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->sum_w_vruntime; - long load = cfs_rq->sum_weight; + long weight = cfs_rq->sum_weight; + s64 delta = 0; - if (curr && curr->on_rq) { - unsigned long weight = scale_load_down(curr->load.weight); + if (curr && !curr->on_rq) + curr = NULL; - avg += entity_key(cfs_rq, curr) * weight; - load += weight; - } + if (weight) { + s64 runtime = cfs_rq->sum_w_vruntime; + + if (curr) { + unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight); + + runtime += entity_key(cfs_rq, curr) * w; + weight += w; + } - if (load) { /* sign flips effective floor / ceiling */ - if (avg < 0) - avg -= (load - 1); - avg = div_s64(avg, load); + if (runtime < 0) + runtime -= (weight - 1); + + delta = div64_long(runtime, weight); + } else if (curr) { + /* + * When there is but one element, it is the average. + */ + delta = curr->vruntime - cfs_rq->zero_vruntime; } - return cfs_rq->zero_vruntime + avg; + update_zero_vruntime(cfs_rq, delta); + + return cfs_rq->zero_vruntime; } +static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq); + /* * lag_i = S - s_i = w_i * (V - v_i) * @@ -724,19 +828,45 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) * EEVDF gives the following limit for a steady state system: * * -r_max < lag < max(r_max, q) - * - * XXX could add max_slice to the augmented data to track this. */ -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime) { + u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC; s64 vlag, limit; + vlag = avruntime - se->vruntime; + limit = calc_delta_fair(max_slice, se); + + return clamp(vlag, -limit, limit); +} + +/* + * Delayed dequeue aims to reduce the negative lag of a dequeued task. While + * updating the lag of an entity, check that negative lag didn't increase + * during the delayed dequeue period which would be unfair. 
+ * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO + * is set. + * + * Return true if the lag has been adjusted. + */ +static __always_inline +bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); + bool ret; + WARN_ON_ONCE(!se->on_rq); - vlag = avg_vruntime(cfs_rq) - se->vruntime; - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + if (se->sched_delayed) { + /* previous vlag < 0 otherwise se would not be delayed */ + vlag = max(vlag, se->vlag); + if (sched_feat(DELAY_ZERO)) + vlag = min(vlag, 0); + } + ret = (vlag == se->vlag); + se->vlag = vlag; - se->vlag = clamp(vlag, -limit, limit); + return ret; } /* @@ -763,7 +893,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { - unsigned long weight = scale_load_down(curr->load.weight); + unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight); avg += entity_key(cfs_rq, curr) * weight; load += weight; @@ -777,16 +907,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) return vruntime_eligible(cfs_rq, se->vruntime); } -static void update_zero_vruntime(struct cfs_rq *cfs_rq) -{ - u64 vruntime = avg_vruntime(cfs_rq); - s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); - - sum_w_vruntime_update(cfs_rq, delta); - - cfs_rq->zero_vruntime = vruntime; -} - static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) { struct sched_entity *root = __pick_root_entity(cfs_rq); @@ -802,6 +922,21 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) return min_slice; } +static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq) +{ + struct sched_entity *root = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 max_slice = 0ULL; + + if (curr && curr->on_rq) + max_slice = curr->slice; + + if (root) + max_slice = max(max_slice, root->max_slice); + + return 
max_slice; +} + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) { return entity_before(__node_2_se(a), __node_2_se(b)); @@ -826,6 +961,15 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n } } +static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (rse->max_slice > se->max_slice) + se->max_slice = rse->max_slice; + } +} + /* * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ @@ -833,6 +977,7 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { u64 old_min_vruntime = se->min_vruntime; u64 old_min_slice = se->min_slice; + u64 old_max_slice = se->max_slice; struct rb_node *node = &se->run_node; se->min_vruntime = se->vruntime; @@ -843,8 +988,13 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) __min_slice_update(se, node->rb_right); __min_slice_update(se, node->rb_left); + se->max_slice = se->slice; + __max_slice_update(se, node->rb_right); + __max_slice_update(se, node->rb_left); + return se->min_vruntime == old_min_vruntime && - se->min_slice == old_min_slice; + se->min_slice == old_min_slice && + se->max_slice == old_max_slice; } RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, @@ -856,7 +1006,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { sum_w_vruntime_add(cfs_rq, se); - update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, @@ -868,7 +1017,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); sum_w_vruntime_sub(cfs_rq, se); - update_zero_vruntime(cfs_rq); } struct sched_entity 
*__pick_root_entity(struct cfs_rq *cfs_rq) @@ -968,7 +1116,7 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) /* * Picking the ->next buddy will affect latency but not fairness. */ - if (sched_feat(PICK_BUDDY) && + if (sched_feat(PICK_BUDDY) && protect && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ WARN_ON_ONCE(cfs_rq->next->sched_delayed); @@ -1075,6 +1223,7 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) * EEVDF: vd_i = ve_i + r_i / w_i */ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); + avg_vruntime(cfs_rq); /* * The task has consumed its request, reschedule. @@ -3784,19 +3933,128 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) se_weight(se) * -se->avg.load_sum); } -static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); +static void +rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot) +{ + unsigned long old_weight = se->load.weight; + + /* + * VRUNTIME + * -------- + * + * COROLLARY #1: The virtual runtime of the entity needs to be + * adjusted if re-weight at !0-lag point. + * + * Proof: For contradiction assume this is not true, so we can + * re-weight without changing vruntime at !0-lag point. 
+ * + * Weight VRuntime Avg-VRuntime + * before w v V + * after w' v' V' + * + * Since lag needs to be preserved through re-weight: + * + * lag = (V - v)*w = (V'- v')*w', where v = v' + * ==> V' = (V - v)*w/w' + v (1) + * + * Let W be the total weight of the entities before reweight, + * since V' is the new weighted average of entities: + * + * V' = (WV + w'v - wv) / (W + w' - w) (2) + * + * by using (1) & (2) we obtain: + * + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) + * + * Since we are doing at !0-lag point which means V != v, we + * can simplify (3): + * + * ==> W / (W + w' - w) = w / w' + * ==> Ww' = Ww + ww' - ww + * ==> W * (w' - w) = w * (w' - w) + * ==> W = w (re-weight indicates w' != w) + * + * So the cfs_rq contains only one entity, hence vruntime of + * the entity @v should always equal to the cfs_rq's weighted + * average vruntime @V, which means we will always re-weight + * at 0-lag point, thus breach assumption. Proof completed. + * + * + * COROLLARY #2: Re-weight does NOT affect weighted average + * vruntime of all the entities. + * + * Proof: According to corollary #1, Eq. (1) should be: + * + * (V - v)*w = (V' - v')*w' + * ==> v' = V' - (V - v)*w/w' (4) + * + * According to the weighted average formula, we have: + * + * V' = (WV - wv + w'v') / (W - w + w') + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') + * = (WV + w'V' - Vw) / (W - w + w') + * + * ==> V'*(W - w + w') = WV + w'V' - Vw + * ==> V' * (W - w) = (W - w) * V (5) + * + * If the entity is the only one in the cfs_rq, then reweight + * always occurs at 0-lag point, so V won't change. Or else + * there are other entities, hence W != w, then Eq. (5) turns + * into V' = V. So V won't change in either case, proof done. 
+ * + * + * So according to corollary #1 & #2, the effect of re-weight + * on vruntime should be: + * + * v' = V' - (V - v) * w / w' (4) + * = V - (V - v) * w / w' + * = V - vl * w / w' + * = V - vl' + */ + se->vlag = div64_long(se->vlag * old_weight, weight); + + /* + * DEADLINE + * -------- + * + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. + * + * d' = v' + (d - v)*w/w' + * = V' - (V - v)*w/w' + (d - v)*w/w' + * = V - (V - v)*w/w' + (d - v)*w/w' + * = V + (d - V)*w/w' + */ + if (se->rel_deadline) + se->deadline = div64_long(se->deadline * old_weight, weight); + + if (rel_vprot) + se->vprot = div64_long(se->vprot * old_weight, weight); +} static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; + bool rel_vprot = false; + u64 avruntime = 0; if (se->on_rq) { /* commit outstanding execution time */ update_curr(cfs_rq); - update_entity_lag(cfs_rq, se); - se->deadline -= se->vruntime; + avruntime = avg_vruntime(cfs_rq); + se->vlag = entity_lag(cfs_rq, se, avruntime); + se->deadline -= avruntime; se->rel_deadline = 1; + if (curr && protect_slice(se)) { + se->vprot -= avruntime; + rel_vprot = true; + } + cfs_rq->nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); @@ -3804,25 +4062,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } dequeue_load_avg(cfs_rq, se); - /* - * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), - * we need to scale se->vlag when w_i changes. 
- */ - se->vlag = div_s64(se->vlag * se->load.weight, weight); - if (se->rel_deadline) - se->deadline = div_s64(se->deadline * se->load.weight, weight); + rescale_entity(se, weight, rel_vprot); update_load_set(&se->load, weight); do { u32 divider = get_pelt_divider(&se->avg); - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); enqueue_load_avg(cfs_rq, se); if (se->on_rq) { - place_entity(cfs_rq, se, 0); + if (rel_vprot) + se->vprot += avruntime; + se->deadline += avruntime; + se->rel_deadline = 0; + se->vruntime = avruntime - se->vlag; + update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); @@ -5096,6 +5352,7 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { u64 vslice, vruntime = avg_vruntime(cfs_rq); + bool update_zero = false; s64 lag = 0; if (!se->custom_slice) @@ -5112,7 +5369,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; - unsigned long load; + long load, weight; lag = se->vlag; @@ -5170,17 +5427,44 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ load = cfs_rq->sum_weight; if (curr && curr->on_rq) - load += scale_load_down(curr->load.weight); + load += avg_vruntime_weight(cfs_rq, curr->load.weight); - lag *= load + scale_load_down(se->load.weight); + weight = avg_vruntime_weight(cfs_rq, se->load.weight); + lag *= load + weight; if (WARN_ON_ONCE(!load)) load = 1; - lag = div_s64(lag, load); + lag = div64_long(lag, load); + + /* + * A heavy entity (relative to the tree) will pull the + * avg_vruntime close to its vruntime position on enqueue. But + * the zero_vruntime point is only updated at the next + * update_deadline()/place_entity()/update_entity_lag(). 
+ * + * Specifically (see the comment near avg_vruntime_weight()): + * + * sum_w_vruntime = \Sum (v_i - v0) * w_i + * + * Note that if v0 is near a light entity, both terms will be + * small for the light entity, while in that case both terms + * are large for the heavy entity, leading to risk of + * overflow. + * + * OTOH if v0 is near the heavy entity, then the difference is + * larger for the light entity, but the factor is small, while + * for the heavy entity the difference is small but the factor + * is large. Avoiding the multiplication overflow. + */ + if (weight > load) + update_zero = true; } se->vruntime = vruntime - lag; - if (se->rel_deadline) { + if (update_zero) + update_zero_vruntime(cfs_rq, -lag); + + if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { se->deadline += se->vruntime; se->rel_deadline = 0; return; @@ -5330,13 +5614,6 @@ static void clear_delayed(struct sched_entity *se) } } -static inline void finish_delayed_dequeue_entity(struct sched_entity *se) -{ - clear_delayed(se); - if (sched_feat(DELAY_ZERO) && se->vlag > 0) - se->vlag = 0; -} - static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -5362,6 +5639,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); + update_entity_lag(cfs_rq, se); set_delayed(se); return false; } @@ -5401,7 +5679,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); if (flags & DEQUEUE_DELAYED) - finish_delayed_dequeue_entity(se); + clear_delayed(se); if (cfs_rq->nr_queued == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); @@ -5420,7 +5698,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } static void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first) { clear_buddies(cfs_rq, 
se); @@ -5435,7 +5713,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - set_protect_slice(cfs_rq, se); + if (first) + set_protect_slice(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); @@ -5530,7 +5809,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr_lazy(rq_of(cfs_rq)); + resched_curr(rq_of(cfs_rq)); return; } #endif @@ -6735,27 +7014,41 @@ static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; + unsigned long scale = 1024; + unsigned long util = 0; + u64 vdelta; + u64 delta; WARN_ON_ONCE(task_rq(p) != rq); - if (rq->cfs.h_nr_queued > 1) { - u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; - u64 slice = se->slice; - s64 delta = slice - ran; + if (rq->cfs.h_nr_queued <= 1) + return; - if (delta < 0) { - if (task_current_donor(rq, p)) - resched_curr(rq); - return; - } - hrtick_start(rq, delta); + /* + * Compute time until virtual deadline + */ + vdelta = se->deadline - se->vruntime; + if ((s64)vdelta < 0) { + if (task_current_donor(rq, p)) + resched_curr(rq); + return; + } + delta = (se->load.weight * vdelta) / NICE_0_LOAD; + + /* + * Correct for instantaneous load of other classes. + */ + util += cpu_util_irq(rq); + if (util && util < 1024) { + scale *= 1024; + scale /= (1024 - util); } + + hrtick_start(rq, (scale * delta) / 1024); } /* - * called from enqueue/dequeue and updates the hrtick when the - * current task is from our class and nr_running is low enough - * to matter. + * Called on enqueue to start the hrtick when h_nr_queued becomes more than 1. 
*/ static void hrtick_update(struct rq *rq) { @@ -6764,6 +7057,9 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return; + if (hrtick_active(rq)) + return; + hrtick_start_fair(rq, donor); } #else /* !CONFIG_SCHED_HRTICK: */ @@ -6779,16 +7075,15 @@ static inline void hrtick_update(struct rq *rq) static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min, rq_util_max; + unsigned long rq_util_max; if (!sched_energy_enabled()) return false; - rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); /* Return true only if the utilization doesn't fit CPU's capacity */ - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); + return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu); } /* @@ -6826,9 +7121,15 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -static int sched_idle_cpu(int cpu) +static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p) +{ + return sched_idle_rq(rq) && !task_has_idle_policy(p); +} + +static int choose_idle_cpu(int cpu, struct task_struct *p) { - return sched_idle_rq(cpu_rq(cpu)); + return available_idle_cpu(cpu) || + choose_sched_idle_rq(cpu_rq(cpu), p); } static void @@ -6844,18 +7145,14 @@ requeue_delayed_entity(struct sched_entity *se) WARN_ON_ONCE(!se->sched_delayed); WARN_ON_ONCE(!se->on_rq); - if (sched_feat(DELAY_ZERO)) { - update_entity_lag(cfs_rq, se); - if (se->vlag > 0) { - cfs_rq->nr_queued--; - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); - se->vlag = 0; - place_entity(cfs_rq, se, 0); - if (se != cfs_rq->curr) - __enqueue_entity(cfs_rq, se); - cfs_rq->nr_queued++; - } + if (update_entity_lag(cfs_rq, se)) { + cfs_rq->nr_queued--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + place_entity(cfs_rq, se, 0); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; } update_load_avg(cfs_rq, se, 0); @@ 
-7086,9 +7383,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) WARN_ON_ONCE(!task_sleep); WARN_ON_ONCE(p->on_rq != 1); - /* Fix-up what dequeue_task_fair() skipped */ - hrtick_update(rq); - /* * Fix-up what block_task() skipped. * @@ -7122,8 +7416,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). */ - - hrtick_update(rq); return true; } @@ -7393,7 +7685,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct * if (!sched_core_cookie_match(rq, p)) continue; - if (sched_idle_cpu(i)) + if (choose_sched_idle_rq(rq, p)) return i; if (available_idle_cpu(i)) { @@ -7484,8 +7776,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && - sched_cpu_cookie_match(cpu_rq(cpu), p)) + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; return -1; @@ -7558,7 +7849,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { + if (choose_sched_idle_rq(cpu_rq(cpu), p) && + cpumask_test_cpu(cpu, cpus)) { *idle_cpu = cpu; break; } @@ -7593,7 +7885,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t */ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + if (choose_idle_cpu(cpu, p)) return cpu; } @@ -7632,21 +7924,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - struct sched_domain_shared *sd_share; - - cpumask_and(cpus, sched_domain_span(sd), 
p->cpus_ptr); if (sched_feat(SIS_UTIL)) { - sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target)); - if (sd_share) { - /* because !--nr is the condition to stop scan */ - nr = READ_ONCE(sd_share->nr_idle_scan) + 1; - /* overloaded LLC is unlikely to have idle cpu/core */ - if (nr == 1) - return -1; - } + /* + * Increment because !--nr is the condition to stop scan. + * + * Since "sd" is "sd_llc" for target CPU dereferenced in the + * caller, it is safe to directly dereference "sd->shared". + * Topology bits always ensure it is assigned for "sd_llc" and it + * cannot disappear as long as we have an RCU-protected + * reference to one of the associated "sd" here. + */ + nr = READ_ONCE(sd->shared->nr_idle_scan) + 1; + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; } + if (!cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr)) + return -1; + if (static_branch_unlikely(&sched_cluster_active)) { struct sched_group *sg = sd->groups; @@ -7715,7 +8012,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + if (!choose_idle_cpu(cpu, p)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); @@ -7786,7 +8083,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + if (choose_idle_cpu(target, p) && asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -7794,7 +8091,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + choose_idle_cpu(prev, p) && asym_fits_cpu(task_util, util_min, util_max, prev)) { if 
(!static_branch_unlikely(&sched_cluster_active) || @@ -7826,7 +8123,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + choose_idle_cpu(recent_used_cpu, p) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { @@ -8326,10 +8623,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct perf_domain *pd; struct energy_env eenv; - rcu_read_lock(); pd = rcu_dereference_all(rd->pd); if (!pd) - goto unlock; + return target; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -8339,13 +8635,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) - goto unlock; + return target; target = prev_cpu; sync_entity_load_avg(&p->se); if (!task_util_est(p) && p_util_min == 0) - goto unlock; + return target; eenv_task_busy_time(&eenv, p, prev_cpu); @@ -8440,7 +8736,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) prev_cpu); /* CPU utilization has changed */ if (prev_delta < base_energy) - goto unlock; + return target; prev_delta -= base_energy; prev_actual_cap = cpu_actual_cap; best_delta = min(best_delta, prev_delta); @@ -8464,7 +8760,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) - goto unlock; + return target; cur_delta -= base_energy; /* @@ -8481,7 +8777,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) best_actual_cap = cpu_actual_cap; } } - rcu_read_unlock(); if ((best_fits > prev_fits) || ((best_fits > 0) && (best_delta < prev_delta)) || @@ -8489,11 +8784,6 @@ static int 
find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = best_energy_cpu; return target; - -unlock: - rcu_read_unlock(); - - return target; } /* @@ -8538,7 +8828,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } - rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If both 'cpu' and 'prev_cpu' are part of this domain, @@ -8564,14 +8853,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) break; } - if (unlikely(sd)) { - /* Slow path */ - new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (wake_flags & WF_TTWU) { /* XXX always ? */ - /* Fast path */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } - rcu_read_unlock(); + /* Slow path */ + if (unlikely(sd)) + return sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); + + /* Fast path */ + if (wake_flags & WF_TTWU) + return select_idle_sibling(p, prev_cpu, new_cpu); return new_cpu; } @@ -8862,8 +9150,10 @@ pick: return; preempt: - if (preempt_action == PREEMPT_WAKEUP_SHORT) + if (preempt_action == PREEMPT_WAKEUP_SHORT) { cancel_protect_slice(se); + clear_buddies(cfs_rq, se); + } resched_curr_lazy(rq); } @@ -8948,13 +9238,13 @@ again: pse = parent_entity(pse); } if (se_depth >= pse_depth) { - set_next_entity(cfs_rq_of(se), se); + set_next_entity(cfs_rq_of(se), se, true); se = parent_entity(se); } } put_prev_entity(cfs_rq, pse); - set_next_entity(cfs_rq, se); + set_next_entity(cfs_rq, se, true); __set_next_task_fair(rq, p, true); } @@ -9054,7 +9344,7 @@ static void yield_task_fair(struct rq *rq) */ if (entity_eligible(cfs_rq, se)) { se->vruntime = se->deadline; - se->deadline += calc_delta_fair(se->slice, se); + update_deadline(cfs_rq, se); } } @@ -9711,32 +10001,6 @@ next: } /* - * attach_task() -- attach the task detached by detach_task() to its new rq. 
- */ -static void attach_task(struct rq *rq, struct task_struct *p) -{ - lockdep_assert_rq_held(rq); - - WARN_ON_ONCE(task_rq(p) != rq); - activate_task(rq, p, ENQUEUE_NOCLOCK); - wakeup_preempt(rq, p, 0); -} - -/* - * attach_one_task() -- attaches the task returned from detach_one_task() to - * its new rq. - */ -static void attach_one_task(struct rq *rq, struct task_struct *p) -{ - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - attach_task(rq, p); - rq_unlock(rq, &rf); -} - -/* * attach_tasks() -- attaches all tasks detached by detach_tasks() to their * new rq. */ @@ -9973,6 +10237,7 @@ struct sg_lb_stats { unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned int group_smt_balance; /* Task on busy SMT be moved */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_overutilized; /* At least one CPU is overutilized in the group */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -10205,6 +10470,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) static inline bool group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { + /* + * With EAS and uclamp, 1 CPU in the group must be overutilized to + * consider the group overloaded. + */ + if (sched_energy_enabled() && !sgs->group_overutilized) + return false; + if (sgs->sum_nr_running <= sgs->group_weight) return false; @@ -10388,14 +10660,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. 
* @sg_overloaded: sched_group is overloaded - * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - bool *sg_overloaded, - bool *sg_overutilized) + bool *sg_overloaded) { int i, nr_running, local_group, sd_flags = env->sd->flags; bool balancing_at_rd = !env->sd->parent; @@ -10417,7 +10687,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_nr_running += nr_running; if (cpu_overutilized(i)) - *sg_overutilized = 1; + sgs->group_overutilized = 1; /* * No need to call idle_cpu() if nr_running is not 0 @@ -10993,6 +11263,7 @@ static void update_idle_cpu_scan(struct lb_env *env, unsigned long sum_util) { struct sched_domain_shared *sd_share; + struct sched_domain *sd = env->sd; int llc_weight, pct; u64 x, y, tmp; /* @@ -11006,11 +11277,7 @@ static void update_idle_cpu_scan(struct lb_env *env, if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) return; - llc_weight = per_cpu(sd_llc_size, env->dst_cpu); - if (env->sd->span_weight != llc_weight) - return; - - sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu)); + sd_share = sd->shared; if (!sd_share) return; @@ -11044,10 +11311,11 @@ static void update_idle_cpu_scan(struct lb_env *env, */ /* equation [3] */ x = sum_util; + llc_weight = sd->span_weight; do_div(x, llc_weight); /* equation [4] */ - pct = env->sd->imbalance_pct; + pct = sd->imbalance_pct; tmp = x * x * pct * pct; do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); @@ -11088,13 +11356,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded); if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = 
*sgs; } + sg_overutilized |= sgs->group_overutilized; + /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; @@ -12215,7 +12485,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su sd->newidle_success += success; if (sd->newidle_call >= 1024) { - sd->newidle_ratio = sd->newidle_success; + u64 now = sched_clock(); + s64 delta = now - sd->newidle_stamp; + sd->newidle_stamp = now; + int ratio = 0; + + if (delta < 0) + delta = 0; + + if (sched_feat(NI_RATE)) { + /* + * ratio delta freq + * + * 1024 - 4 s - 128 Hz + * 512 - 2 s - 256 Hz + * 256 - 1 s - 512 Hz + * 128 - .5 s - 1024 Hz + * 64 - .25 s - 2048 Hz + */ + ratio = delta >> 22; + } + + ratio += sd->newidle_success; + + sd->newidle_ratio = min(1024, ratio); sd->newidle_call /= 2; sd->newidle_success /= 2; } @@ -12262,7 +12555,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + int busy = idle != CPU_IDLE && !sched_idle_rq(rq); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -12300,7 +12593,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. 
*/ idle = idle_cpu(cpu); - busy = !idle && !sched_idle_cpu(cpu); + busy = !idle && !sched_idle_rq(rq); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); @@ -12345,14 +12638,14 @@ static inline int on_null_domain(struct rq *rq) */ static inline int find_new_ilb(void) { + int this_cpu = smp_processor_id(); const struct cpumask *hk_mask; int ilb_cpu; hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { - - if (ilb_cpu == smp_processor_id()) + if (ilb_cpu == this_cpu) continue; if (idle_cpu(ilb_cpu)) @@ -12908,7 +13201,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t0 = sched_clock_cpu(this_cpu); __sched_balance_update_blocked_averages(this_rq); - this_rq->next_class = &fair_sched_class; + rq_modified_begin(this_rq, &fair_sched_class); raw_spin_rq_unlock(this_rq); for_each_domain(this_cpu, sd) { @@ -12922,7 +13215,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { unsigned int weight = 1; - if (sched_feat(NI_RANDOM)) { + if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) { /* * Throw a 1k sided dice; and only run * newidle_balance according to the success @@ -12975,7 +13268,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) pulled_task = 1; /* If a higher prio class was modified, restart the pick */ - if (sched_class_above(this_rq->next_class, &fair_sched_class)) + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: @@ -13365,11 +13658,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (queued) { - if (!need_resched()) - hrtick_start_fair(rq, curr); + if (queued) return; - } if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); @@ -13568,7 +13858,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) 
for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se); + set_next_entity(cfs_rq, se, first); /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); } @@ -13951,7 +14241,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; } if (ng) { - gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; } print_numa_stats(m, node, tsf, tpf, gsf, gpf); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 136a6584be79..84c4fe3abd74 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -58,13 +58,20 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(DELAY_DEQUEUE, true) SCHED_FEAT(DELAY_ZERO, true) +SCHED_FEAT(PARANOID_AVG, false) + /* * Allow wakeup-time preemption of the current task: */ SCHED_FEAT(WAKEUP_PREEMPTION, true) +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +SCHED_FEAT(HRTICK, true) +SCHED_FEAT(HRTICK_DL, true) +#else SCHED_FEAT(HRTICK, false) SCHED_FEAT(HRTICK_DL, false) +#endif /* * Decrement CPU capacity based on time not spent running tasks @@ -126,3 +133,4 @@ SCHED_FEAT(LATENCY_WARN, false) * Do newidle balancing proportional to its success rate using randomization. 
*/ SCHED_FEAT(NI_RANDOM, true) +SCHED_FEAT(NI_RATE, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 3681b6ad9276..a83be0c834dd 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -161,6 +161,14 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, return cpuidle_enter(drv, dev, next_state); } +static void idle_call_stop_or_retain_tick(bool stop_tick) +{ + if (stop_tick || tick_nohz_tick_stopped()) + tick_nohz_idle_stop_tick(); + else + tick_nohz_idle_retain_tick(); +} + /** * cpuidle_idle_call - the main idle function * @@ -170,7 +178,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * set, and it returns with polling set. If it ever stops polling, it * must clear the polling bit. */ -static void cpuidle_idle_call(void) +static void cpuidle_idle_call(bool stop_tick) { struct cpuidle_device *dev = cpuidle_get_device(); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); @@ -186,7 +194,7 @@ static void cpuidle_idle_call(void) } if (cpuidle_not_available(drv, dev)) { - tick_nohz_idle_stop_tick(); + idle_call_stop_or_retain_tick(stop_tick); default_idle_call(); goto exit_idle; @@ -221,24 +229,35 @@ static void cpuidle_idle_call(void) next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); - } else { - bool stop_tick = true; + } else if (drv->state_count > 1) { + /* + * stop_tick is expected to be true by default by cpuidle + * governors, which allows them to select idle states with + * target residency above the tick period length. + */ + stop_tick = true; /* * Ask the cpuidle framework to choose a convenient idle state. 
*/ next_state = cpuidle_select(drv, dev, &stop_tick); - if (stop_tick || tick_nohz_tick_stopped()) - tick_nohz_idle_stop_tick(); - else - tick_nohz_idle_retain_tick(); + idle_call_stop_or_retain_tick(stop_tick); entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome */ cpuidle_reflect(dev, entered_state); + } else { + idle_call_stop_or_retain_tick(stop_tick); + + /* + * If there is only a single idle state (or none), there is + * nothing meaningful for the governor to choose. Skip the + * governor and always use state 0. + */ + call_cpuidle(drv, dev, 0); } exit_idle: @@ -259,6 +278,7 @@ exit_idle: static void do_idle(void) { int cpu = smp_processor_id(); + bool got_tick = false; /* * Check if we need to update blocked load @@ -329,8 +349,9 @@ static void do_idle(void) tick_nohz_idle_restart_tick(); cpu_idle_poll(); } else { - cpuidle_idle_call(); + cpuidle_idle_call(got_tick); } + got_tick = tick_nohz_idle_got_tick(); arch_cpu_idle_exit(); } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 3b725d39c06e..ef152d401fe2 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -123,8 +123,6 @@ int housekeeping_update(struct cpumask *isol_mask) struct cpumask *trial, *old = NULL; int err; - lockdep_assert_cpus_held(); - trial = kmalloc(cpumask_size(), GFP_KERNEL); if (!trial) return -ENOMEM; @@ -136,7 +134,7 @@ int housekeeping_update(struct cpumask *isol_mask) } if (!housekeeping.flags) - static_branch_enable_cpuslocked(&housekeeping_overridden); + static_branch_enable(&housekeeping_overridden); if (housekeeping.flags & HK_FLAG_DOMAIN) old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f69e1f16d923..4ee8faf01441 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int flags) { struct task_struct *p = NULL; + 
struct rq *rq = rq_of_rt_rq(rt_rq); if (!schedstat_enabled()) return; - if (rt_entity_is_task(rt_se)) + if (rt_entity_is_task(rt_se)) { p = rt_task_of(rt_se); + if (p != rq->curr) + update_stats_wait_end_rt(rt_rq, rt_se); + } + if ((flags & DEQUEUE_SLEEP) && p) { unsigned int state; @@ -1853,13 +1858,22 @@ static int find_lowest_rq(struct task_struct *task) static struct task_struct *pick_next_pushable_task(struct rq *rq) { - struct task_struct *p; + struct plist_head *head = &rq->rt.pushable_tasks; + struct task_struct *i, *p = NULL; if (!has_pushable_tasks(rq)) return NULL; - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); + plist_for_each_entry(i, head, pushable_tasks) { + /* make sure task isn't on_cpu (possible with proxy-exec) */ + if (!task_on_cpu(rq, i)) { + p = i; + break; + } + } + + if (!p) + return NULL; BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); @@ -2652,7 +2666,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) { struct rt_schedulable_data *d = data; struct task_group *child; - unsigned long total, sum = 0; + u64 total, sum = 0; u64 period, runtime; period = ktime_to_ns(tg->rt_bandwidth.rt_period); @@ -2676,9 +2690,6 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; - if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) - return -EBUSY; - total = to_ratio(period, runtime); /* @@ -2818,19 +2829,6 @@ long sched_group_rt_period(struct task_group *tg) return rt_period_us; } -#ifdef CONFIG_SYSCTL -static int sched_rt_global_constraints(void) -{ - int ret = 0; - - mutex_lock(&rt_constraints_mutex); - ret = __rt_schedulable(NULL, 0, 0); - mutex_unlock(&rt_constraints_mutex); - - return ret; -} -#endif /* CONFIG_SYSCTL */ - int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept real-time tasks when there is no way for them to run */ @@ -2840,14 +2838,6 @@ 
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) return 1; } -#else /* !CONFIG_RT_GROUP_SCHED: */ - -#ifdef CONFIG_SYSCTL -static int sched_rt_global_constraints(void) -{ - return 0; -} -#endif /* CONFIG_SYSCTL */ #endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SYSCTL @@ -2859,11 +2849,14 @@ static int sched_rt_global_validate(void) NSEC_PER_USEC > max_rt_runtime))) return -EINVAL; - return 0; -} +#ifdef CONFIG_RT_GROUP_SCHED + if (!rt_group_sched_enabled()) + return 0; -static void sched_rt_do_global(void) -{ + scoped_guard(mutex, &rt_constraints_mutex) + return __rt_schedulable(NULL, 0, 0); +#endif + return 0; } static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, @@ -2889,11 +2882,6 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff if (ret) goto undo; - ret = sched_rt_global_constraints(); - if (ret) - goto undo; - - sched_rt_do_global(); sched_dl_do_global(); } if (0) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b82fb70a9d54..9f63b15d309d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); -extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); @@ -684,8 +684,9 @@ struct cfs_rq { s64 sum_w_vruntime; u64 sum_weight; - u64 zero_vruntime; + unsigned int sum_shift; + #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; u64 
zero_vruntime_fi; @@ -782,7 +783,6 @@ enum scx_rq_flags { SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ - SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */ @@ -798,19 +798,29 @@ struct scx_rq { u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ + bool in_select_cpu; bool cpu_released; u32 flags; + u32 nr_immed; /* ENQ_IMMED tasks on local_dsq */ u64 clock; /* current per-rq clock -- see scx_bpf_now() */ cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; + cpumask_var_t cpus_to_sync; + bool kick_sync_pending; unsigned long kick_sync; - local_t reenq_local_deferred; + + struct task_struct *sub_dispatch_prev; + + raw_spinlock_t deferred_reenq_lock; + u64 deferred_reenq_locals_seq; + struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */ + struct list_head deferred_reenq_users; /* user DSQs requesting reenq */ struct balance_callback deferred_bal_cb; + struct balance_callback kick_sync_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; - struct scx_dispatch_q bypass_dsq; }; #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -1285,6 +1295,8 @@ struct rq { call_single_data_t hrtick_csd; struct hrtimer hrtick_timer; ktime_t hrtick_time; + ktime_t hrtick_delay; + unsigned int hrtick_sched; #endif #ifdef CONFIG_SCHEDSTATS @@ -1606,15 +1618,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass) extern bool raw_spin_rq_trylock(struct rq *rq) __cond_acquires(true, __rq_lockp(rq)); -extern void raw_spin_rq_unlock(struct rq *rq) - __releases(__rq_lockp(rq)); - static inline void raw_spin_rq_lock(struct rq *rq) __acquires(__rq_lockp(rq)) { raw_spin_rq_lock_nested(rq, 0); } +static inline void 
raw_spin_rq_unlock(struct rq *rq) + __releases(__rq_lockp(rq)) +{ + raw_spin_unlock(rq_lockp(rq)); +} + static inline void raw_spin_rq_lock_irq(struct rq *rq) __acquires(__rq_lockp(rq)) { @@ -1853,6 +1868,13 @@ static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {} static inline void scx_rq_clock_invalidate(struct rq *rq) {} #endif /* !CONFIG_SCHED_CLASS_EXT */ +static inline void assert_balance_callbacks_empty(struct rq *rq) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_LOCKING) && + rq->balance_callback && + rq->balance_callback != &balance_push_callback); +} + /* * Lockdep annotation that avoids accidental unlocks; it's like a * sticky/continuous lockdep_assert_held(). @@ -1869,7 +1891,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; - WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); + assert_balance_callbacks_empty(rq); } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) @@ -2748,6 +2770,17 @@ static inline const struct sched_class *next_active_class(const struct sched_cla #define sched_class_above(_a, _b) ((_a) < (_b)) +static inline void rq_modified_begin(struct rq *rq, const struct sched_class *class) +{ + if (sched_class_above(rq->next_class, class)) + rq->next_class = class; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class) +{ + return sched_class_above(rq->next_class, class); +} + static inline bool sched_stop_runnable(struct rq *rq) { return rq->stop && task_on_rq_queued(rq->stop); @@ -2838,7 +2871,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert(rcu_read_lock_any_held()); return rq->idle_state; } @@ -2885,7 +2918,7 @@ extern void init_cfs_throttle_work(struct task_struct *p); #define MAX_BW_BITS (64 - BW_SHIFT) 
#define MAX_BW ((1ULL << MAX_BW_BITS) - 1) -extern unsigned long to_ratio(u64 period, u64 runtime); +extern u64 to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); extern void post_init_entity_util_avg(struct task_struct *p); @@ -2990,6 +3023,29 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); +/* + * attach_task() -- attach the task detached by detach_task() to its new rq. + */ +static inline void attach_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + + WARN_ON_ONCE(task_rq(p) != rq); + activate_task(rq, p, ENQUEUE_NOCLOCK); + wakeup_preempt(rq, p, 0); +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. + */ +static inline void attach_one_task(struct rq *rq, struct task_struct *p) +{ + guard(rq_lock)(rq); + update_rq_clock(rq); + attach_task(rq, p); +} + #ifdef CONFIG_PREEMPT_RT # define SCHED_NR_MIGRATE_BREAK 8 #else @@ -3019,46 +3075,31 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; * - enabled by features * - hrtimer is actually high res */ -static inline int hrtick_enabled(struct rq *rq) +static inline bool hrtick_enabled(struct rq *rq) { - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); + return cpu_active(cpu_of(rq)) && hrtimer_highres_enabled(); } -static inline int hrtick_enabled_fair(struct rq *rq) +static inline bool hrtick_enabled_fair(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK) && hrtick_enabled(rq); } -static inline int hrtick_enabled_dl(struct rq *rq) +static inline bool hrtick_enabled_dl(struct rq *rq) { - if (!sched_feat(HRTICK_DL)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK_DL) && hrtick_enabled(rq); } extern void hrtick_start(struct rq *rq, u64 delay); - -#else 
/* !CONFIG_SCHED_HRTICK: */ - -static inline int hrtick_enabled_fair(struct rq *rq) -{ - return 0; -} - -static inline int hrtick_enabled_dl(struct rq *rq) +static inline bool hrtick_active(struct rq *rq) { - return 0; -} - -static inline int hrtick_enabled(struct rq *rq) -{ - return 0; + return hrtimer_active(&rq->hrtick_timer); } +#else /* !CONFIG_SCHED_HRTICK: */ +static inline bool hrtick_enabled_fair(struct rq *rq) { return false; } +static inline bool hrtick_enabled_dl(struct rq *rq) { return false; } +static inline bool hrtick_enabled(struct rq *rq) { return false; } #endif /* !CONFIG_SCHED_HRTICK */ #ifndef arch_scale_freq_tick diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 6f10db3646e7..b215b0ead9a6 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -284,6 +284,35 @@ static bool check_same_owner(struct task_struct *p) uid_eq(cred->euid, pcred->uid)); } +#ifdef CONFIG_RT_MUTEXES +static inline void __setscheduler_dl_pi(int newprio, int policy, + struct task_struct *p, + struct sched_change_ctx *scope) +{ + /* + * In case a DEADLINE task (either proper or boosted) gets + * setscheduled to a lower priority class, check if it needs to + * inherit parameters from a potential pi_task. In that case make + * sure replenishment happens with the next enqueue. 
+ */ + + if (dl_prio(newprio) && !dl_policy(policy)) { + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + if (pi_task) { + p->dl.pi_se = pi_task->dl.pi_se; + scope->flags |= ENQUEUE_REPLENISH; + } + } +} +#else /* !CONFIG_RT_MUTEXES */ +static inline void __setscheduler_dl_pi(int newprio, int policy, + struct task_struct *p, + struct sched_change_ctx *scope) +{ +} +#endif /* !CONFIG_RT_MUTEXES */ + #ifdef CONFIG_UCLAMP_TASK static int uclamp_validate(struct task_struct *p, @@ -655,6 +684,7 @@ change: __setscheduler_params(p, attr); p->sched_class = next_class; p->prio = newprio; + __setscheduler_dl_pi(newprio, policy, p, scope); } __setscheduler_uclamp(p, attr); @@ -881,10 +911,10 @@ err_size: return -E2BIG; } -static void get_params(struct task_struct *p, struct sched_attr *attr) +static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags) { if (task_has_dl_policy(p)) { - __getparam_dl(p, attr); + __getparam_dl(p, attr, flags); } else if (task_has_rt_policy(p)) { attr->sched_priority = p->rt_priority; } else { @@ -950,7 +980,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, return -ESRCH; if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); + get_params(p, &attr, 0); return sched_setattr(p, &attr); } @@ -1035,7 +1065,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags)) + usize < SCHED_ATTR_SIZE_VER0)) return -EINVAL; scoped_guard (rcu) { @@ -1043,6 +1073,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, if (!p) return -ESRCH; + if (flags) { + if (!task_has_dl_policy(p) || + flags != SCHED_GETATTR_FLAG_DL_DYNAMIC) + return -EINVAL; + } + retval = security_task_getscheduler(p); if (retval) return retval; @@ -1050,7 +1086,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 
kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); + get_params(p, &kattr, flags); kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 32dcddaead82..5847b83d9d55 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -4,6 +4,7 @@ */ #include <linux/sched/isolation.h> +#include <linux/sched/clock.h> #include <linux/bsearch.h> #include "sched.h" @@ -272,7 +273,7 @@ void rebuild_sched_domains_energy(void) static int sched_energy_aware_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret, state; + int ret; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -288,8 +289,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { - state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) + if (sysctl_sched_energy_aware != sched_energy_enabled()) rebuild_sched_domains_energy(); } @@ -387,11 +387,11 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) static void sched_energy_set(bool has_eas) { - if (!has_eas && static_branch_unlikely(&sched_energy_present)) { + if (!has_eas && sched_energy_enabled()) { if (sched_debug()) pr_info("%s: stopping EAS\n", __func__); static_branch_disable_cpuslocked(&sched_energy_present); - } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) { + } else if (has_eas && !sched_energy_enabled()) { if (sched_debug()) pr_info("%s: starting EAS\n", __func__); static_branch_enable_cpuslocked(&sched_energy_present); @@ -684,6 +684,9 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); + + /* If sd_llc exists, sd_llc_shared should exist too. 
*/ + WARN_ON_ONCE(!sd->shared); sds = sd->shared; } @@ -732,6 +735,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; + /* Pick reference to parent->shared. */ + if (parent->shared) { + WARN_ON_ONCE(tmp->shared); + tmp->shared = parent->shared; + parent->shared = NULL; + } + if (parent->parent) { parent->parent->child = tmp; parent->parent->groups->flags = tmp->flags; @@ -781,6 +791,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } struct s_data { + struct sched_domain_shared * __percpu *sds; struct sched_domain * __percpu *sd; struct root_domain *rd; }; @@ -788,6 +799,7 @@ struct s_data { enum s_alloc { sa_rootdomain, sa_sd, + sa_sd_shared, sa_sd_storage, sa_none, }; @@ -1534,6 +1546,9 @@ static void set_domain_attribute(struct sched_domain *sd, static void __sdt_free(const struct cpumask *cpu_map); static int __sdt_alloc(const struct cpumask *cpu_map); +static void __sds_free(struct s_data *d, const struct cpumask *cpu_map); +static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map); + static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { @@ -1545,6 +1560,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_sd: free_percpu(d->sd); fallthrough; + case sa_sd_shared: + __sds_free(d, cpu_map); + fallthrough; case sa_sd_storage: __sdt_free(cpu_map); fallthrough; @@ -1560,9 +1578,11 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) if (__sdt_alloc(cpu_map)) return sa_sd_storage; + if (__sds_alloc(d, cpu_map)) + return sa_sd_shared; d->sd = alloc_percpu(struct sched_domain *); if (!d->sd) - return sa_sd_storage; + return sa_sd_shared; d->rd = alloc_rootdomain(); if (!d->rd) return sa_sd; @@ -1575,21 +1595,25 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) * sched_group structure 
so that the subsequent __free_domain_allocs() * will not free the data we're using. */ -static void claim_allocations(int cpu, struct sched_domain *sd) +static void claim_allocations(int cpu, struct s_data *d) { - struct sd_data *sdd = sd->private; + struct sched_domain *sd; + + if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref)) + *per_cpu_ptr(d->sds, cpu) = NULL; - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; + for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) { + struct sd_data *sdd = sd->private; - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) - *per_cpu_ptr(sdd->sg, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) + *per_cpu_ptr(sdd->sg, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) - *per_cpu_ptr(sdd->sgc, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; + } } #ifdef CONFIG_NUMA @@ -1642,14 +1666,19 @@ sd_init(struct sched_domain_topology_level *tl, struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; + u64 now = sched_clock(); - sd_weight = cpumask_weight(tl->mask(tl, cpu)); + sd_span = sched_domain_span(sd); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); + sd_weight = cpumask_weight(sd_span); + sd_id = cpumask_first(sd_span); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, - "wrong sd_flags in topology description\n")) + "wrong sd_flags in topology description\n")) sd_flags &= TOPOLOGY_SD_FLAGS; + sd_flags |= asym_cpu_capacity_classify(sd_span, cpu_map); *sd = (struct sched_domain){ .min_interval = sd_weight, @@ -1679,6 +1708,7 @@ sd_init(struct sched_domain_topology_level *tl, .newidle_call = 512, .newidle_success = 
256, .newidle_ratio = 512, + .newidle_stamp = now, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, @@ -1686,12 +1716,6 @@ sd_init(struct sched_domain_topology_level *tl, .name = tl->name, }; - sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); - sd_id = cpumask_first(sd_span); - - sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); - WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), "CPU capacity asymmetry not supported on SMT\n"); @@ -1727,16 +1751,6 @@ sd_init(struct sched_domain_topology_level *tl, sd->cache_nice_tries = 1; } - /* - * For all levels sharing cache; connect a sched_domain_shared - * instance. - */ - if (sd->flags & SD_SHARE_LLC) { - sd->shared = *per_cpu_ptr(sdd->sds, sd_id); - atomic_inc(&sd->shared->ref); - atomic_set(&sd->shared->nr_busy_cpus, sd_weight); - } - sd->private = sdd; return sd; @@ -2372,10 +2386,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; - sdd->sds = alloc_percpu(struct sched_domain_shared *); - if (!sdd->sds) - return -ENOMEM; - sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -2386,7 +2396,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; - struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -2397,13 +2406,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; - sds = kzalloc_node(sizeof(struct sched_domain_shared), - GFP_KERNEL, cpu_to_node(j)); - if (!sds) - return -ENOMEM; - - *per_cpu_ptr(sdd->sds, j) = sds; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -2445,8 +2447,6 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } - if (sdd->sds) - kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) 
kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -2454,8 +2454,6 @@ static void __sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; - free_percpu(sdd->sds); - sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); @@ -2463,6 +2461,42 @@ static void __sdt_free(const struct cpumask *cpu_map) } } +static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map) +{ + int j; + + d->sds = alloc_percpu(struct sched_domain_shared *); + if (!d->sds) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain_shared *sds; + + sds = kzalloc_node(sizeof(struct sched_domain_shared), + GFP_KERNEL, cpu_to_node(j)); + if (!sds) + return -ENOMEM; + + *per_cpu_ptr(d->sds, j) = sds; + } + + return 0; +} + +static void __sds_free(struct s_data *d, const struct cpumask *cpu_map) +{ + int j; + + if (!d->sds) + return; + + for_each_cpu(j, cpu_map) + kfree(*per_cpu_ptr(d->sds, j)); + + free_percpu(d->sds); + d->sds = NULL; +} + static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) @@ -2549,6 +2583,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map) } /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ +static void adjust_numa_imbalance(struct sched_domain *sd_llc) +{ + struct sched_domain *parent; + unsigned int imb_span = 1; + unsigned int imb = 0; + unsigned int nr_llcs; + + WARN_ON(!(sd_llc->flags & SD_SHARE_LLC)); + WARN_ON(!sd_llc->parent); + + /* + * For a single LLC per node, allow an + * imbalance up to 12.5% of the node. This is + * arbitrary cutoff based two factors -- SMT and + * memory channels. For SMT-2, the intent is to + * avoid premature sharing of HT resources but + * SMT-4 or SMT-8 *may* benefit from a different + * cutoff. 
For memory channels, this is a very + * rough estimate of how many channels may be + * active and is based on recent CPUs with + * many cores. + * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. This assumes that there are + * enough logical CPUs per LLC to avoid SMT + * factors and that there is a correlation + * between LLCs and memory channels. + */ + nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight; + if (nr_llcs == 1) + imb = sd_llc->parent->span_weight >> 3; + else + imb = nr_llcs; + + imb = max(1U, imb); + sd_llc->parent->imb_numa_nr = imb; + + /* + * Set span based on the first NUMA domain. + * + * NUMA systems always add a NODE domain before + * iterating the NUMA domains. Since this is before + * degeneration, start from sd_llc's parent's + * parent which is the lowest an SD_NUMA domain can + * be relative to sd_llc. + */ + parent = sd_llc->parent->parent; + while (parent && !(parent->flags & SD_NUMA)) + parent = parent->parent; + + imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight; + + /* Update the upper remainder of the topology */ + parent = sd_llc->parent; + while (parent) { + int factor = max(1U, (parent->span_weight / imb_span)); + + parent->imb_numa_nr = imb * factor; + parent = parent->parent; + } +} + +/* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs */ @@ -2605,61 +2707,28 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } - /* - * Calculate an allowed NUMA imbalance such that LLCs do not get - * imbalanced. 
- */ for_each_cpu(i, cpu_map) { - unsigned int imb = 0; - unsigned int imb_span = 1; + sd = *per_cpu_ptr(d.sd, i); + if (!sd) + continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - struct sched_domain *child = sd->child; - - if (!(sd->flags & SD_SHARE_LLC) && child && - (child->flags & SD_SHARE_LLC)) { - struct sched_domain __rcu *top_p; - unsigned int nr_llcs; - - /* - * For a single LLC per node, allow an - * imbalance up to 12.5% of the node. This is - * arbitrary cutoff based two factors -- SMT and - * memory channels. For SMT-2, the intent is to - * avoid premature sharing of HT resources but - * SMT-4 or SMT-8 *may* benefit from a different - * cutoff. For memory channels, this is a very - * rough estimate of how many channels may be - * active and is based on recent CPUs with - * many cores. - * - * For multiple LLCs, allow an imbalance - * until multiple tasks would share an LLC - * on one node while LLCs on another node - * remain idle. This assumes that there are - * enough logical CPUs per LLC to avoid SMT - * factors and that there is a correlation - * between LLCs and memory channels. - */ - nr_llcs = sd->span_weight / child->span_weight; - if (nr_llcs == 1) - imb = sd->span_weight >> 3; - else - imb = nr_llcs; - imb = max(1U, imb); - sd->imb_numa_nr = imb; - - /* Set span based on the first NUMA domain. */ - top_p = sd->parent; - while (top_p && !(top_p->flags & SD_NUMA)) { - top_p = top_p->parent; - } - imb_span = top_p ? 
top_p->span_weight : sd->span_weight; - } else { - int factor = max(1U, (sd->span_weight / imb_span)); + /* First, find the topmost SD_SHARE_LLC domain */ + while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) + sd = sd->parent; - sd->imb_numa_nr = imb * factor; - } + if (sd->flags & SD_SHARE_LLC) { + int sd_id = cpumask_first(sched_domain_span(sd)); + + sd->shared = *per_cpu_ptr(d.sds, sd_id); + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); + atomic_inc(&sd->shared->ref); + + /* + * In presence of higher domains, adjust the + * NUMA imbalance stats for the hierarchy. + */ + if (IS_ENABLED(CONFIG_NUMA) && sd->parent) + adjust_numa_imbalance(sd); } } @@ -2668,10 +2737,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - claim_allocations(i, sd); + claim_allocations(i, &d); + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) init_sched_groups_capacity(i, sd); - } } /* Attach the domains */ diff --git a/kernel/signal.c b/kernel/signal.c index d65d0fe24bfb..2d102e025883 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1000,9 +1000,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) * Found a killable thread. If the signal will be fatal, * then start taking the whole group down immediately. */ - if (sig_fatal(p, sig) && - (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) && - !sigismember(&t->real_blocked, sig) && + if (sig_fatal(p, sig) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. @@ -2173,13 +2171,13 @@ bool do_notify_parent(struct task_struct *tsk, int sig) bool autoreap = false; u64 utime, stime; - WARN_ON_ONCE(sig == -1); + if (WARN_ON_ONCE(!valid_signal(sig))) + return false; /* do_notify_parent_cldstop should have been called instead. 
*/ WARN_ON_ONCE(task_is_stopped_or_traced(tsk)); - WARN_ON_ONCE(!tsk->ptrace && - (tsk->group_leader != tsk || !thread_group_empty(tsk))); + WARN_ON_ONCE(!tsk->ptrace && !thread_group_empty(tsk)); /* ptraced, or group-leader without sub-threads */ do_notify_pidfd(tsk); @@ -2251,11 +2249,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig) if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } + if (!tsk->ptrace && tsk->signal->autoreap) { + autoreap = true; + sig = 0; + } /* * Send with __send_signal as si_pid and si_uid are in the * parent's namespaces. */ - if (valid_signal(sig) && sig) + if (sig) __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); @@ -2814,8 +2816,9 @@ bool get_signal(struct ksignal *ksig) /* * Do this once, we can't return to user-mode if freezing() == T. - * do_signal_stop() and ptrace_stop() do freezable_schedule() and - * thus do not need another check after return. + * do_signal_stop() and ptrace_stop() set TASK_STOPPED/TASK_TRACED + * and the freezer handles those states via TASK_FROZEN, thus they + * do not need another check after return. */ try_to_freeze(); diff --git a/kernel/smp.c b/kernel/smp.c index f349960f79ca..a0bb56bd8dda 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -215,7 +215,7 @@ static atomic_t n_csd_lock_stuck; /** * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? * - * Returns @true if a CSD-lock acquisition is stuck and has been stuck + * Returns: @true if a CSD-lock acquisition is stuck and has been stuck * long enough for a "non-responsive CSD lock" message to be printed. 
*/ bool csd_lock_is_stuck(void) @@ -377,6 +377,20 @@ static __always_inline void csd_unlock(call_single_data_t *csd) static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG +static call_single_data_t *get_single_csd_data(int cpu) +{ + if (static_branch_unlikely(&csdlock_debug_enabled)) + return per_cpu_ptr(&csd_data, cpu); + return this_cpu_ptr(&csd_data); +} +#else +static call_single_data_t *get_single_csd_data(int cpu) +{ + return this_cpu_ptr(&csd_data); +} +#endif + void __smp_call_single_queue(int cpu, struct llist_node *node) { /* @@ -394,7 +408,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? sched_ttwu_pending : csd->func; - trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); + trace_call__csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* @@ -625,13 +639,14 @@ void flush_smp_call_function_queue(void) local_irq_restore(flags); } -/* +/** * smp_call_function_single - Run a function on a specific CPU + * @cpu: Specific target CPU for this function. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * - * Returns 0 on success, else a negative status code. + * Returns: %0 on success, else a negative status code. 
*/ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) @@ -670,14 +685,14 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd = &csd_stack; if (!wait) { - csd = this_cpu_ptr(&csd_data); + csd = get_single_csd_data(cpu); csd_lock(csd); } csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - csd->node.src = smp_processor_id(); + csd->node.src = this_cpu; csd->node.dst = cpu; #endif @@ -738,18 +753,18 @@ out: } EXPORT_SYMBOL_GPL(smp_call_function_single_async); -/* +/** * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed. * - * Returns 0 on success, else a negative status code (if no cpus were online). - * * Selection preference: * 1) current cpu if in @mask * 2) nearest cpu in @mask, based on NUMA topology + * + * Returns: %0 on success, else a negative status code (if no cpus were online). */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) @@ -832,7 +847,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - csd->node.src = smp_processor_id(); + csd->node.src = this_cpu; csd->node.dst = cpu; #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); @@ -880,7 +895,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, } /** - * smp_call_function_many(): Run a function on a set of CPUs. + * smp_call_function_many() - Run a function on a set of CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. 
@@ -902,14 +917,12 @@ void smp_call_function_many(const struct cpumask *mask, EXPORT_SYMBOL(smp_call_function_many); /** - * smp_call_function(): Run a function on all other CPUs. + * smp_call_function() - Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * - * Returns 0. - * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. * @@ -1009,8 +1022,8 @@ void __init smp_init(void) smp_cpus_done(setup_max_cpus); } -/* - * on_each_cpu_cond(): Call a function on each processor for which +/** + * on_each_cpu_cond_mask() - Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting * for all the required CPUs to finish. This may include the local * processor. @@ -1024,6 +1037,7 @@ void __init smp_init(void) * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. + * @mask: The set of cpus to run on (only runs on online subset). * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. @@ -1095,7 +1109,7 @@ EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); * scheduled, for any of the CPUs in the @mask. It does not guarantee * correctness as it only provides a racy snapshot. * - * Returns true if there is a pending IPI scheduled and false otherwise. + * Returns: true if there is a pending IPI scheduled and false otherwise. */ bool cpus_peek_for_pending_ipi(const struct cpumask *mask) { @@ -1145,6 +1159,18 @@ static void smp_call_on_cpu_callback(struct work_struct *work) complete(&sscs->done); } +/** + * smp_call_on_cpu() - Call a function on a specific CPU and wait + * for it to return. 
+ * @cpu: The CPU to run on. + * @func: The function to run + * @par: An arbitrary pointer parameter for @func. + * @phys: If @true, force to run on physical @cpu. See + * &struct smp_call_on_cpu_struct for more info. + * + * Returns: %-ENXIO if the @cpu is invalid; otherwise the return value + * from @func. + */ int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { @@ -1159,7 +1185,7 @@ int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; - queue_work_on(cpu, system_wq, &sscs.work); + queue_work_on(cpu, system_percpu_wq, &sscs.work); wait_for_completion(&sscs.done); destroy_work_on_stack(&sscs.work); diff --git a/kernel/softirq.c b/kernel/softirq.c index 77198911b8dd..4425d8dce44b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -663,6 +663,13 @@ void irq_enter_rcu(void) { __irq_enter_raw(); + /* + * If this is a nested interrupt that hits the exit_to_user_mode_loop + * where it has enabled interrupts but before it has hit schedule() we + * could have hrtimers in an undefined state. Fix it up here. + */ + hrtimer_rearm_deferred(); + if (tick_nohz_full_cpu(smp_processor_id()) || (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET))) tick_irq_enter(); @@ -719,8 +726,14 @@ static inline void __irq_exit_rcu(void) #endif account_hardirq_exit(current); preempt_count_sub(HARDIRQ_OFFSET); - if (!in_interrupt() && local_softirq_pending()) + if (!in_interrupt() && local_softirq_pending()) { + /* + * If we left hrtimers unarmed, make sure to arm them now, + * before enabling interrupts to run SoftIRQ. 
+ */ + hrtimer_rearm_deferred(); invoke_softirq(); + } if (IS_ENABLED(CONFIG_IRQ_FORCED_THREADING) && force_irqthreads() && local_timers_pending_force_th() && !(in_nmi() | in_hardirq())) diff --git a/kernel/sys.c b/kernel/sys.c index c86eba9aa7e9..62e842055cc9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2388,17 +2388,18 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st return -EINVAL; } -int __weak arch_get_indir_br_lp_status(struct task_struct *t, unsigned long __user *status) +int __weak arch_prctl_get_branch_landing_pad_state(struct task_struct *t, + unsigned long __user *state) { return -EINVAL; } -int __weak arch_set_indir_br_lp_status(struct task_struct *t, unsigned long status) +int __weak arch_prctl_set_branch_landing_pad_state(struct task_struct *t, unsigned long state) { return -EINVAL; } -int __weak arch_lock_indir_br_lp_status(struct task_struct *t, unsigned long status) +int __weak arch_prctl_lock_branch_landing_pad_state(struct task_struct *t) { return -EINVAL; } @@ -2888,20 +2889,23 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = rseq_slice_extension_prctl(arg2, arg3); break; - case PR_GET_INDIR_BR_LP_STATUS: - if (arg3 || arg4 || arg5) + case PR_GET_CFI: + if (arg2 != PR_CFI_BRANCH_LANDING_PADS) return -EINVAL; - error = arch_get_indir_br_lp_status(me, (unsigned long __user *)arg2); - break; - case PR_SET_INDIR_BR_LP_STATUS: - if (arg3 || arg4 || arg5) + if (arg4 || arg5) return -EINVAL; - error = arch_set_indir_br_lp_status(me, arg2); + error = arch_prctl_get_branch_landing_pad_state(me, (unsigned long __user *)arg3); break; - case PR_LOCK_INDIR_BR_LP_STATUS: - if (arg3 || arg4 || arg5) + case PR_SET_CFI: + if (arg2 != PR_CFI_BRANCH_LANDING_PADS) return -EINVAL; - error = arch_lock_indir_br_lp_status(me, arg2); + if (arg4 || arg5) + return -EINVAL; + error = arch_prctl_set_branch_landing_pad_state(me, arg3); + if (error) + break; + if (arg3 & PR_CFI_LOCK 
&& !(arg3 & PR_CFI_DISABLE)) + error = arch_prctl_lock_branch_landing_pad_state(me); break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d3a666ffde1..c9efb17cc255 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1118,7 +1118,7 @@ int proc_do_large_bitmap(const struct ctl_table *table, int dir, unsigned long bitmap_len = table->maxlen; unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; - char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c = 0; if (!bitmap || !bitmap_len || !left || (*ppos && SYSCTL_KERN_TO_USER(dir))) { *lenp = 0; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 0cd680ccc7e5..73bd6a6a7893 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -649,6 +649,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) goto err; memcpy(stats, tsk->signal->stats, sizeof(*stats)); + stats->version = TASKSTATS_VERSION; send: send_cpu_listeners(rep_skb, listeners); diff --git a/kernel/time/.kunitconfig b/kernel/time/.kunitconfig new file mode 100644 index 000000000000..d60a611b2853 --- /dev/null +++ b/kernel/time/.kunitconfig @@ -0,0 +1,2 @@ +CONFIG_KUNIT=y +CONFIG_TIME_KUNIT_TEST=y diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 7c6a52f7836c..02aac7c5aa76 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -9,14 +9,13 @@ config CLOCKSOURCE_WATCHDOG bool -# Architecture has extra clocksource data -config ARCH_CLOCKSOURCE_DATA - bool - # Architecture has extra clocksource init called from registration config ARCH_CLOCKSOURCE_INIT bool +config ARCH_WANTS_CLOCKSOURCE_READ_INLINE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool @@ -44,10 +43,23 @@ config GENERIC_CLOCKEVENTS_BROADCAST_IDLE config GENERIC_CLOCKEVENTS_MIN_ADJUST bool +config GENERIC_CLOCKEVENTS_COUPLED + bool + +config 
GENERIC_CLOCKEVENTS_COUPLED_INLINE + select GENERIC_CLOCKEVENTS_COUPLED + bool + # Generic update of CMOS clock config GENERIC_CMOS_UPDATE bool +# Deferred rearming of the hrtimer interrupt +config HRTIMER_REARM_DEFERRED + def_bool y + depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS + depends on HIGH_RES_TIMERS && SCHED_HRTICK + # Select to handle posix CPU timers from task_work # and not from the timer interrupt context config HAVE_POSIX_CPU_TIMERS_TASK_WORK @@ -196,18 +208,6 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. -config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US - int "Clocksource watchdog maximum allowable skew (in microseconds)" - depends on CLOCKSOURCE_WATCHDOG - range 50 1000 - default 125 - help - Specify the maximum amount of allowable watchdog skew in - microseconds before reporting the clocksource to be unstable. - The default is based on a half-second clocksource watchdog - interval and NTP's maximum frequency drift of 500 parts - per million. If the clocksource is good enough for NTP, - it is good enough for the clocksource watchdog! 
endif config POSIX_AUX_CLOCKS diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f7d52d9543cc..eaf290c972f9 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -30,5 +30,6 @@ obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o +obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 069d93bfb0c7..6e173d70d825 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -234,19 +234,23 @@ static int alarmtimer_suspend(struct device *dev) if (!rtc) return 0; - /* Find the soonest timer to expire*/ + /* Find the soonest timer to expire */ for (i = 0; i < ALARM_NUMTYPE; i++) { struct alarm_base *base = &alarm_bases[i]; struct timerqueue_node *next; + ktime_t next_expires; ktime_t delta; - scoped_guard(spinlock_irqsave, &base->lock) + scoped_guard(spinlock_irqsave, &base->lock) { next = timerqueue_getnext(&base->timerqueue); + if (next) + next_expires = next->expires; + } if (!next) continue; - delta = ktime_sub(next->expires, base->get_ktime()); + delta = ktime_sub(next_expires, base->get_ktime()); if (!min || (delta < min)) { - expires = next->expires; + expires = next_expires; min = delta; type = i; } @@ -540,7 +544,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return alarm_forward(alarm, timr->it_interval, now); + return alarm_forward(alarm, now, timr->it_interval); } /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index eaae1ce9f060..b4d730604972 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -172,6 +172,7 @@ void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, 
CLOCK_EVT_STATE_SHUTDOWN); dev->next_event = KTIME_MAX; + dev->next_event_forced = 0; } /** @@ -292,6 +293,38 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE +#include <asm/clock_inlined.h> +#else +static __always_inline void +arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { } +#endif + +static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + u64 cycles; + + if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED))) + return false; + + if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles))) + return false; + + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE)) + arch_inlined_clockevent_set_next_coupled(cycles, dev); + else + dev->set_next_coupled(cycles, dev); + return true; +} + +#else +static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + return false; +} +#endif + /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program @@ -300,12 +333,10 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) * * Returns 0 on success, -ETIME when the event is in the past. */ -int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - bool force) +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { - unsigned long long clc; int64_t delta; - int rc; + u64 cycles; if (WARN_ON_ONCE(expires < 0)) return -ETIME; @@ -319,21 +350,35 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); - /* Shortcut for clockevent devices that can deal with ktime. 
*/ - if (dev->features & CLOCK_EVT_FEAT_KTIME) + /* ktime_t based reprogramming for the broadcast hrtimer device */ + if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) return dev->set_next_ktime(expires, dev); + if (likely(clockevent_set_next_coupled(dev, expires))) + return 0; + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); - if (delta <= 0) - return force ? clockevents_program_min_delta(dev) : -ETIME; - delta = min(delta, (int64_t) dev->max_delta_ns); - delta = max(delta, (int64_t) dev->min_delta_ns); + /* Required for tick_periodic() during early boot */ + if (delta <= 0 && !force) + return -ETIME; + + if (delta > (int64_t)dev->min_delta_ns) { + delta = min(delta, (int64_t) dev->max_delta_ns); + cycles = ((u64)delta * dev->mult) >> dev->shift; + if (!dev->set_next_event((unsigned long) cycles, dev)) + return 0; + } - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - rc = dev->set_next_event((unsigned long) clc, dev); + if (dev->next_event_forced) + return 0; - return (rc && force) ? clockevents_program_min_delta(dev) : rc; + if (dev->set_next_event(dev->min_delta_ticks, dev)) { + if (!force || clockevents_program_min_delta(dev)) + return -ETIME; + } + dev->next_event_forced = 1; + return 0; } /* diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 38dae590b29f..b4cf17b4aeed 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -3,202 +3,196 @@ * Unit test for the clocksource watchdog. * * Copyright (C) 2021 Facebook, Inc. + * Copyright (C) 2026 Intel Corp. * * Author: Paul E. 
McKenney <paulmck@kernel.org> + * Author: Thomas Gleixner <tglx@kernel.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/device.h> #include <linux/clocksource.h> -#include <linux/init.h> +#include <linux/delay.h> #include <linux/module.h> -#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ -#include <linux/tick.h> #include <linux/kthread.h> -#include <linux/delay.h> -#include <linux/prandom.h> -#include <linux/cpu.h> #include "tick-internal.h" +#include "timekeeping_internal.h" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Clocksource watchdog unit test"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); +MODULE_AUTHOR("Thomas Gleixner <tglx@kernel.org>"); + +enum wdtest_states { + WDTEST_INJECT_NONE, + WDTEST_INJECT_DELAY, + WDTEST_INJECT_POSITIVE, + WDTEST_INJECT_NEGATIVE, + WDTEST_INJECT_PERCPU = 0x100, +}; -static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; -module_param(holdoff, int, 0444); -MODULE_PARM_DESC(holdoff, "Time to wait to start test (s)."); +static enum wdtest_states wdtest_state; +static unsigned long wdtest_test_count; +static ktime_t wdtest_last_ts, wdtest_offset; -/* Watchdog kthread's task_struct pointer for debug purposes. */ -static struct task_struct *wdtest_task; +#define SHIFT_4000PPM 8 -static u64 wdtest_jiffies_read(struct clocksource *cs) +static ktime_t wdtest_get_offset(struct clocksource *cs) { - return (u64)jiffies; -} - -static struct clocksource clocksource_wdtest_jiffies = { - .name = "wdtest-jiffies", - .rating = 1, /* lowest valid rating*/ - .uncertainty_margin = TICK_NSEC, - .read = wdtest_jiffies_read, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_MUST_VERIFY, - .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ - .shift = JIFFIES_SHIFT, - .max_cycles = 10, -}; + if (wdtest_state < WDTEST_INJECT_PERCPU) + return wdtest_test_count & 0x1 ? 
0 : wdtest_offset >> SHIFT_4000PPM; -static int wdtest_ktime_read_ndelays; -static bool wdtest_ktime_read_fuzz; + /* Only affect the readout of the "remote" CPU */ + return cs->wd_cpu == smp_processor_id() ? 0 : NSEC_PER_MSEC; +} static u64 wdtest_ktime_read(struct clocksource *cs) { - int wkrn = READ_ONCE(wdtest_ktime_read_ndelays); - static int sign = 1; - u64 ret; + ktime_t now = ktime_get_raw_fast_ns(); + ktime_t intv = now - wdtest_last_ts; - if (wkrn) { - udelay(cs->uncertainty_margin / 250); - WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1); - } - ret = ktime_get_real_fast_ns(); - if (READ_ONCE(wdtest_ktime_read_fuzz)) { - sign = -sign; - ret = ret + sign * 100 * NSEC_PER_MSEC; + /* + * Only increment the test counter once per watchdog interval and + * store the interval for the offset calculation of this step. This + * guarantees a consistent behaviour even if the other side needs + * to repeat due to a watchdog read timeout. + */ + if (intv > (NSEC_PER_SEC / 4)) { + WRITE_ONCE(wdtest_test_count, wdtest_test_count + 1); + wdtest_last_ts = now; + wdtest_offset = intv; } - return ret; -} -static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs) -{ - pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name); + switch (wdtest_state & ~WDTEST_INJECT_PERCPU) { + case WDTEST_INJECT_POSITIVE: + return now + wdtest_get_offset(cs); + case WDTEST_INJECT_NEGATIVE: + return now - wdtest_get_offset(cs); + case WDTEST_INJECT_DELAY: + udelay(500); + return now; + default: + return now; + } } -#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ - CLOCK_SOURCE_VALID_FOR_HRES | \ - CLOCK_SOURCE_MUST_VERIFY | \ - CLOCK_SOURCE_VERIFY_PERCPU) +#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ + CLOCK_SOURCE_CALIBRATED | \ + CLOCK_SOURCE_MUST_VERIFY | \ + CLOCK_SOURCE_WDTEST) static struct clocksource clocksource_wdtest_ktime = { .name = "wdtest-ktime", - .rating = 300, + .rating = 10, .read = wdtest_ktime_read, .mask = CLOCKSOURCE_MASK(64), .flags = 
KTIME_FLAGS, - .mark_unstable = wdtest_ktime_cs_mark_unstable, .list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list), }; -/* Reset the clocksource if needed. */ -static void wdtest_ktime_clocksource_reset(void) +static void wdtest_clocksource_reset(enum wdtest_states which, bool percpu) +{ + clocksource_unregister(&clocksource_wdtest_ktime); + + pr_info("Test: State %d percpu %d\n", which, percpu); + + wdtest_state = which; + if (percpu) + wdtest_state |= WDTEST_INJECT_PERCPU; + wdtest_test_count = 0; + wdtest_last_ts = 0; + + clocksource_wdtest_ktime.rating = 10; + clocksource_wdtest_ktime.flags = KTIME_FLAGS; + if (percpu) + clocksource_wdtest_ktime.flags |= CLOCK_SOURCE_WDTEST_PERCPU; + clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); +} + +static bool wdtest_execute(enum wdtest_states which, bool percpu, unsigned int expect, + unsigned long calls) { - if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) { - clocksource_unregister(&clocksource_wdtest_ktime); - clocksource_wdtest_ktime.flags = KTIME_FLAGS; - schedule_timeout_uninterruptible(HZ / 10); - clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); + wdtest_clocksource_reset(which, percpu); + + for (; READ_ONCE(wdtest_test_count) < calls; msleep(100)) { + unsigned int flags = READ_ONCE(clocksource_wdtest_ktime.flags); + + if (kthread_should_stop()) + return false; + + if (flags & CLOCK_SOURCE_UNSTABLE) { + if (expect & CLOCK_SOURCE_UNSTABLE) + return true; + pr_warn("Fail: Unexpected unstable\n"); + return false; + } + if (flags & CLOCK_SOURCE_VALID_FOR_HRES) { + if (expect & CLOCK_SOURCE_VALID_FOR_HRES) + return true; + pr_warn("Fail: Unexpected valid for highres\n"); + return false; + } } + + if (!expect) + return true; + + pr_warn("Fail: Timed out\n"); + return false; } -/* Run the specified series of watchdog tests. 
*/ -static int wdtest_func(void *arg) +static bool wdtest_run(bool percpu) { - unsigned long j1, j2; - int i, max_retries; - char *s; + if (!wdtest_execute(WDTEST_INJECT_NONE, percpu, CLOCK_SOURCE_VALID_FOR_HRES, 8)) + return false; - schedule_timeout_uninterruptible(holdoff * HZ); + if (!wdtest_execute(WDTEST_INJECT_DELAY, percpu, 0, 4)) + return false; - /* - * Verify that jiffies-like clocksources get the manually - * specified uncertainty margin. - */ - pr_info("--- Verify jiffies-like uncertainty margin.\n"); - __clocksource_register(&clocksource_wdtest_jiffies); - WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC); + if (!wdtest_execute(WDTEST_INJECT_POSITIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8)) + return false; - j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); - schedule_timeout_uninterruptible(HZ); - j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); - WARN_ON_ONCE(j1 == j2); + if (!wdtest_execute(WDTEST_INJECT_NEGATIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8)) + return false; - clocksource_unregister(&clocksource_wdtest_jiffies); + return true; +} - /* - * Verify that tsc-like clocksources are assigned a reasonable - * uncertainty margin. - */ - pr_info("--- Verify tsc-like uncertainty margin.\n"); +static int wdtest_func(void *arg) +{ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); - WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC); - - j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); - udelay(1); - j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); - pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), - "Expected at least 1000ns, got %lu.\n", j2 - j1); - - /* Verify tsc-like stability with various numbers of errors injected. 
*/ - max_retries = clocksource_get_max_watchdog_retry(); - for (i = 0; i <= max_retries + 1; i++) { - if (i <= 1 && i < max_retries) - s = ""; - else if (i <= max_retries) - s = ", expect message"; - else - s = ", expect clock skew"; - pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s); - WRITE_ONCE(wdtest_ktime_read_ndelays, i); - schedule_timeout_uninterruptible(2 * HZ); - WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays)); - WARN_ON_ONCE((i <= max_retries) != - !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); - wdtest_ktime_clocksource_reset(); + if (wdtest_run(false)) { + if (wdtest_run(true)) + pr_info("Success: All tests passed\n"); } - - /* Verify tsc-like stability with clock-value-fuzz error injection. */ - pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n"); - WRITE_ONCE(wdtest_ktime_read_fuzz, true); - schedule_timeout_uninterruptible(2 * HZ); - WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); - clocksource_verify_percpu(&clocksource_wdtest_ktime); - WRITE_ONCE(wdtest_ktime_read_fuzz, false); - clocksource_unregister(&clocksource_wdtest_ktime); - pr_info("--- Done with test.\n"); - return 0; -} + if (!IS_MODULE(CONFIG_TEST_CLOCKSOURCE_WATCHDOG)) + return 0; -static void wdtest_print_module_parms(void) -{ - pr_alert("--- holdoff=%d\n", holdoff); + while (!kthread_should_stop()) + schedule_timeout_interruptible(3600 * HZ); + return 0; } -/* Cleanup function. */ -static void clocksource_wdtest_cleanup(void) -{ -} +static struct task_struct *wdtest_thread; static int __init clocksource_wdtest_init(void) { - int ret = 0; - - wdtest_print_module_parms(); + struct task_struct *t = kthread_run(wdtest_func, NULL, "wdtest"); - /* Create watchdog-test task. 
*/ - wdtest_task = kthread_run(wdtest_func, NULL, "wdtest"); - if (IS_ERR(wdtest_task)) { - ret = PTR_ERR(wdtest_task); - pr_warn("%s: Failed to create wdtest kthread.\n", __func__); - wdtest_task = NULL; - return ret; + if (IS_ERR(t)) { + pr_warn("Failed to create wdtest kthread.\n"); + return PTR_ERR(t); } - + wdtest_thread = t; return 0; } - module_init(clocksource_wdtest_init); + +static void clocksource_wdtest_cleanup(void) +{ + if (wdtest_thread) + kthread_stop(wdtest_thread); +} module_exit(clocksource_wdtest_cleanup); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index df7194961658..baee13a1f87f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -7,15 +7,17 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/device.h> #include <linux/clocksource.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/device.h> #include <linux/init.h> -#include <linux/module.h> -#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ -#include <linux/tick.h> #include <linux/kthread.h> +#include <linux/module.h> #include <linux/prandom.h> -#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/tick.h> +#include <linux/topology.h> #include "tick-internal.h" #include "timekeeping_internal.h" @@ -107,48 +109,6 @@ static char override_name[CS_NAME_LEN]; static int finished_booting; static u64 suspend_start; -/* - * Interval: 0.5sec. - */ -#define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ)) - -/* - * Threshold: 0.0312s, when doubled: 0.0625s. - */ -#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) - -/* - * Maximum permissible delay between two readouts of the watchdog - * clocksource surrounding a read of the clocksource being validated. - * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as - * a lower bound for cs->uncertainty_margin values when registering clocks. 
- * - * The default of 500 parts per million is based on NTP's limits. - * If a clocksource is good enough for NTP, it is good enough for us! - * - * In other words, by default, even if a clocksource is extremely - * precise (for example, with a sub-nanosecond period), the maximum - * permissible skew between the clocksource watchdog and the clocksource - * under test is not permitted to go below the 500ppm minimum defined - * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the - * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option. - */ -#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US -#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US -#else -#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) -#endif - -/* - * Default for maximum permissible skew when cs->uncertainty_margin is - * not specified, and the lower bound even when cs->uncertainty_margin - * is specified. This is also the default that is used when registering - * clocks with unspecified cs->uncertainty_margin, so this macro is used - * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. - */ -#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) - #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); static void clocksource_select(void); @@ -160,7 +120,42 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; -static int64_t watchdog_max_interval; + +/* Watchdog interval: 0.5sec. */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_INTERVAL_NS (WATCHDOG_INTERVAL * (NSEC_PER_SEC / HZ)) + +/* Maximum time between two reference watchdog readouts */ +#define WATCHDOG_READOUT_MAX_NS (50U * NSEC_PER_USEC) + +/* + * Maximum time between two remote readouts for NUMA=n. On NUMA enabled systems + * the timeout is calculated from the numa distance. 
+ */ +#define WATCHDOG_DEFAULT_TIMEOUT_NS (50U * NSEC_PER_USEC) + +/* + * Remote timeout NUMA distance multiplier. The local distance is 10. The + * default remote distance is 20. ACPI tables provide more accurate numbers + * which are guaranteed to be greater than the local distance. + * + * This results in a 5us base value, which is equivalent to the above !NUMA + * default. + */ +#define WATCHDOG_NUMA_MULTIPLIER_NS ((u64)(WATCHDOG_DEFAULT_TIMEOUT_NS / LOCAL_DISTANCE)) + +/* Limit the NUMA timeout in case the distance values are insanely big */ +#define WATCHDOG_NUMA_MAX_TIMEOUT_NS ((u64)(500U * NSEC_PER_USEC)) + +/* Shift values to calculate the approximate $N ppm of a given delta. */ +#define SHIFT_500PPM 11 +#define SHIFT_4000PPM 8 + +/* Number of attempts to read the watchdog */ +#define WATCHDOG_FREQ_RETRIES 3 + +/* Five reads local and remote for inter CPU skew detection */ +#define WATCHDOG_REMOTE_MAX_SEQ 10 static inline void clocksource_watchdog_lock(unsigned long *flags) { @@ -241,204 +236,422 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static int verify_n_cpus = 8; -module_param(verify_n_cpus, int, 0644); +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; -enum wd_read_status { - WD_READ_SUCCESS, - WD_READ_UNSTABLE, - WD_READ_SKIP + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + +enum wd_result { + WD_SUCCESS, + WD_FREQ_NO_WATCHDOG, + WD_FREQ_TIMEOUT, + WD_FREQ_RESET, + WD_FREQ_SKEWED, + WD_CPU_TIMEOUT, + WD_CPU_SKEWED, +}; + +struct watchdog_cpu_data { + /* Keep first as it is 32 byte aligned */ + call_single_data_t csd; + atomic_t remote_inprogress; + enum wd_result result; + u64 cpu_ts[2]; + struct clocksource *cs; + /* Ensure that the sequence is in a separate cache line */ + atomic_t seq ____cacheline_aligned; + /* Set by the control CPU according to NUMA distance */ + u64 timeout_ns; }; -static enum 
wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) -{ - int64_t md = watchdog->uncertainty_margin; - unsigned int nretries, max_retries; - int64_t wd_delay, wd_seq_delay; - u64 wd_end, wd_end2; - - max_retries = clocksource_get_max_watchdog_retry(); - for (nretries = 0; nretries <= max_retries; nretries++) { - local_irq_disable(); - *wdnow = watchdog->read(watchdog); - *csnow = cs->read(cs); - wd_end = watchdog->read(watchdog); - wd_end2 = watchdog->read(watchdog); - local_irq_enable(); - - wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); - if (wd_delay <= md + cs->uncertainty_margin) { - if (nretries > 1 && nretries >= max_retries) { - pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", - smp_processor_id(), watchdog->name, nretries); +struct watchdog_data { + raw_spinlock_t lock; + enum wd_result result; + + u64 wd_seq; + u64 wd_delta; + u64 cs_delta; + u64 cpu_ts[2]; + + unsigned int curr_cpu; +} ____cacheline_aligned_in_smp; + +static void watchdog_check_skew_remote(void *unused); + +static DEFINE_PER_CPU_ALIGNED(struct watchdog_cpu_data, watchdog_cpu_data) = { + .csd = CSD_INIT(watchdog_check_skew_remote, NULL), +}; + +static struct watchdog_data watchdog_data = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(watchdog_data.lock), +}; + +static inline void watchdog_set_result(struct watchdog_cpu_data *wd, enum wd_result result) +{ + guard(raw_spinlock)(&watchdog_data.lock); + if (!wd->result) { + atomic_set(&wd->seq, WATCHDOG_REMOTE_MAX_SEQ); + WRITE_ONCE(wd->result, result); + } +} + +/* Wait for the sequence number to hand over control. */ +static bool watchdog_wait_seq(struct watchdog_cpu_data *wd, u64 start, int seq) +{ + for(int cnt = 0; atomic_read(&wd->seq) < seq; cnt++) { + /* Bail if the other side set an error result */ + if (READ_ONCE(wd->result) != WD_SUCCESS) + return false; + + /* Prevent endless loops if the other CPU does not react. 
*/ + if (cnt == 5000) { + u64 nsecs = ktime_get_raw_fast_ns(); + + if (nsecs - start >=wd->timeout_ns) { + watchdog_set_result(wd, WD_CPU_TIMEOUT); + return false; } - return WD_READ_SUCCESS; + cnt = 0; } + cpu_relax(); + } + return seq < WATCHDOG_REMOTE_MAX_SEQ; +} - /* - * Now compute delay in consecutive watchdog read to see if - * there is too much external interferences that cause - * significant delay in reading both clocksource and watchdog. - * - * If consecutive WD read-back delay > md, report - * system busy, reinit the watchdog and skip the current - * watchdog test. - */ - wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2); - if (wd_seq_delay > md) - goto skip_test; +static void watchdog_check_skew(struct watchdog_cpu_data *wd, int index) +{ + u64 prev, now, delta, start = ktime_get_raw_fast_ns(); + int local = index, remote = (index + 1) & 0x1; + struct clocksource *cs = wd->cs; + + /* Set the local timestamp so that the first iteration works correctly */ + wd->cpu_ts[local] = cs->read(cs); + + /* Signal arrival */ + atomic_inc(&wd->seq); + + for (int seq = local + 2; seq < WATCHDOG_REMOTE_MAX_SEQ; seq += 2) { + if (!watchdog_wait_seq(wd, start, seq)) + return; + + /* Capture local timestamp before possible non-local coherency overhead */ + now = cs->read(cs); + + /* Store local timestamp before reading remote to limit coherency stalls */ + wd->cpu_ts[local] = now; + + prev = wd->cpu_ts[remote]; + delta = (now - prev) & cs->mask; + + if (delta > cs->max_raw_delta) { + watchdog_set_result(wd, WD_CPU_SKEWED); + return; + } + + /* Hand over to the remote CPU */ + atomic_inc(&wd->seq); } +} - pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. 
limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", - smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name); - return WD_READ_UNSTABLE; +static void watchdog_check_skew_remote(void *unused) +{ + struct watchdog_cpu_data *wd = this_cpu_ptr(&watchdog_cpu_data); -skip_test: - pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", - smp_processor_id(), watchdog->name, wd_seq_delay); - pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", - cs->name, wd_delay); - return WD_READ_SKIP; + atomic_inc(&wd->remote_inprogress); + watchdog_check_skew(wd, 1); + atomic_dec(&wd->remote_inprogress); } -static u64 csnow_mid; -static cpumask_t cpus_ahead; -static cpumask_t cpus_behind; -static cpumask_t cpus_chosen; +static inline bool wd_csd_locked(struct watchdog_cpu_data *wd) +{ + return READ_ONCE(wd->csd.node.u_flags) & CSD_FLAG_LOCK; +} + +/* + * This is only invoked for remote CPUs. See watchdog_check_cpu_skew(). + */ +static inline u64 wd_get_remote_timeout(unsigned int remote_cpu) +{ + unsigned int n1, n2; + u64 ns; + + if (nr_node_ids == 1) + return WATCHDOG_DEFAULT_TIMEOUT_NS; + + n1 = cpu_to_node(smp_processor_id()); + n2 = cpu_to_node(remote_cpu); + ns = WATCHDOG_NUMA_MULTIPLIER_NS * node_distance(n1, n2); + return min(ns, WATCHDOG_NUMA_MAX_TIMEOUT_NS); +} -static void clocksource_verify_choose_cpus(void) +static void __watchdog_check_cpu_skew(struct clocksource *cs, unsigned int cpu) { - int cpu, i, n = verify_n_cpus; + struct watchdog_cpu_data *wd; - if (n < 0 || n >= num_online_cpus()) { - /* Check all of the CPUs. */ - cpumask_copy(&cpus_chosen, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + wd = per_cpu_ptr(&watchdog_cpu_data, cpu); + if (atomic_read(&wd->remote_inprogress) || wd_csd_locked(wd)) { + watchdog_data.result = WD_CPU_TIMEOUT; return; } - /* If no checking desired, or no other CPU to check, leave. 
*/ - cpumask_clear(&cpus_chosen); - if (n == 0 || num_online_cpus() <= 1) + atomic_set(&wd->seq, 0); + wd->result = WD_SUCCESS; + wd->cs = cs; + /* Store the current CPU ID for the watchdog test unit */ + cs->wd_cpu = smp_processor_id(); + + wd->timeout_ns = wd_get_remote_timeout(cpu); + + /* Kick the remote CPU into the watchdog function */ + if (WARN_ON_ONCE(smp_call_function_single_async(cpu, &wd->csd))) { + watchdog_data.result = WD_CPU_TIMEOUT; + return; + } + + scoped_guard(irq) + watchdog_check_skew(wd, 0); + + scoped_guard(raw_spinlock_irq, &watchdog_data.lock) { + watchdog_data.result = wd->result; + memcpy(watchdog_data.cpu_ts, wd->cpu_ts, sizeof(wd->cpu_ts)); + } +} + +static void watchdog_check_cpu_skew(struct clocksource *cs) +{ + unsigned int cpu = watchdog_data.curr_cpu; + + cpu = cpumask_next_wrap(cpu, cpu_online_mask); + watchdog_data.curr_cpu = cpu; + + /* Skip the current CPU. Handles num_online_cpus() == 1 as well */ + if (cpu == smp_processor_id()) return; - /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_any_but(cpu_online_mask, smp_processor_id()); - if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + /* Don't interfere with the test mechanics */ + if ((cs->flags & CLOCK_SOURCE_WDTEST) && !(cs->flags & CLOCK_SOURCE_WDTEST_PERCPU)) return; - cpumask_set_cpu(cpu, &cpus_chosen); - /* Force a sane value for the boot parameter. */ - if (n > nr_cpu_ids) - n = nr_cpu_ids; + __watchdog_check_cpu_skew(cs, cpu); +} + +static bool watchdog_check_freq(struct clocksource *cs, bool reset_pending) +{ + unsigned int ppm_shift = SHIFT_4000PPM; + u64 wd_ts0, wd_ts1, cs_ts; + + watchdog_data.result = WD_SUCCESS; + if (!watchdog) { + watchdog_data.result = WD_FREQ_NO_WATCHDOG; + return false; + } + + if (cs->flags & CLOCK_SOURCE_WDTEST_PERCPU) + return true; /* - * Randomly select the specified number of CPUs. If the same - * CPU is selected multiple times, that CPU is checked only once, - * and no replacement CPU is selected. 
This gracefully handles - * situations where verify_n_cpus is greater than the number of - * CPUs that are currently online. + * If both the clocksource and the watchdog claim they are + * calibrated use 500ppm limit. Uncalibrated clocksources need a + * larger allowance because the firmware supplied frequencies can be + * way off. */ - for (i = 1; i < n; i++) { - cpu = cpumask_random(cpu_online_mask); - if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) - cpumask_set_cpu(cpu, &cpus_chosen); + if (watchdog->flags & CLOCK_SOURCE_CALIBRATED && cs->flags & CLOCK_SOURCE_CALIBRATED) + ppm_shift = SHIFT_500PPM; + + for (int retries = 0; retries < WATCHDOG_FREQ_RETRIES; retries++) { + s64 wd_last, cs_last, wd_seq, wd_delta, cs_delta, max_delta; + + scoped_guard(irq) { + wd_ts0 = watchdog->read(watchdog); + cs_ts = cs->read(cs); + wd_ts1 = watchdog->read(watchdog); + } + + wd_last = cs->wd_last; + cs_last = cs->cs_last; + + /* Validate the watchdog readout window */ + wd_seq = cycles_to_nsec_safe(watchdog, wd_ts0, wd_ts1); + if (wd_seq > WATCHDOG_READOUT_MAX_NS) { + /* Store for printout in case all retries fail */ + watchdog_data.wd_seq = wd_seq; + continue; + } + + /* Store for subsequent processing */ + cs->wd_last = wd_ts0; + cs->cs_last = cs_ts; + + /* First round or reset pending? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || reset_pending) + goto reset; + + /* Calculate the nanosecond deltas from the last invocation */ + wd_delta = cycles_to_nsec_safe(watchdog, wd_last, wd_ts0); + cs_delta = cycles_to_nsec_safe(cs, cs_last, cs_ts); + + watchdog_data.wd_delta = wd_delta; + watchdog_data.cs_delta = cs_delta; + + /* + * Ensure that the deltas are within the readout limits of + * the clocksource and the watchdog. Long delays can cause + * clocksources to overflow. 
+ */ + max_delta = max(wd_delta, cs_delta); + if (max_delta > cs->max_idle_ns || max_delta > watchdog->max_idle_ns) + goto reset; + + /* + * Calculate and validate the skew against the allowed PPM + * value of the maximum delta plus the watchdog readout + * time. + */ + if (abs(wd_delta - cs_delta) < (max_delta >> ppm_shift) + wd_seq) + return true; + + watchdog_data.result = WD_FREQ_SKEWED; + return false; } - /* Don't verify ourselves. */ - cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + watchdog_data.result = WD_FREQ_TIMEOUT; + return false; + +reset: + cs->flags |= CLOCK_SOURCE_WATCHDOG; + watchdog_data.result = WD_FREQ_RESET; + return false; } -static void clocksource_verify_one_cpu(void *csin) +/* Synchronization for sched clock */ +static void clocksource_tick_stable(struct clocksource *cs) { - struct clocksource *cs = (struct clocksource *)csin; - - csnow_mid = cs->read(cs); + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); } -void clocksource_verify_percpu(struct clocksource *cs) +/* Conditionally enable high resolution mode */ +static void clocksource_enable_highres(struct clocksource *cs) { - int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX; - u64 csnow_begin, csnow_end; - int cpu, testcpu; - s64 delta; + if ((cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) || + !(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) || + !watchdog || !(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) + return; + + /* Mark it valid for high-res. */ + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - if (verify_n_cpus == 0) + /* + * Can't schedule work before finished_booting is + * true. clocksource_done_booting will take care of it. 
+ */ + if (!finished_booting) return; - cpumask_clear(&cpus_ahead); - cpumask_clear(&cpus_behind); - cpus_read_lock(); - migrate_disable(); - clocksource_verify_choose_cpus(); - if (cpumask_empty(&cpus_chosen)) { - migrate_enable(); - cpus_read_unlock(); - pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); + + if (cs->flags & CLOCK_SOURCE_WDTEST) return; + + /* + * If this is not the current clocksource let the watchdog thread + * reselect it. Due to the change to high res this clocksource + * might be preferred now. If it is the current clocksource let the + * tick code know about that change. + */ + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); } - testcpu = smp_processor_id(); - pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", - cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); - preempt_disable(); - for_each_cpu(cpu, &cpus_chosen) { - if (cpu == testcpu) - continue; - csnow_begin = cs->read(cs); - smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1); - csnow_end = cs->read(cs); - delta = (s64)((csnow_mid - csnow_begin) & cs->mask); - if (delta < 0) - cpumask_set_cpu(cpu, &cpus_behind); - delta = (csnow_end - csnow_mid) & cs->mask; - if (delta < 0) - cpumask_set_cpu(cpu, &cpus_ahead); - cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end); - if (cs_nsec > cs_nsec_max) - cs_nsec_max = cs_nsec; - if (cs_nsec < cs_nsec_min) - cs_nsec_min = cs_nsec; +} + +static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 2); + +static void watchdog_print_freq_timeout(struct clocksource *cs) +{ + if (!__ratelimit(&ratelimit_state)) + return; + pr_info("Watchdog %s read timed out. 
Readout sequence took: %lluns\n", + watchdog->name, watchdog_data.wd_seq); +} + +static void watchdog_print_freq_skew(struct clocksource *cs) +{ + pr_warn("Marking clocksource %s unstable due to frequency skew\n", cs->name); + pr_warn("Watchdog %20s interval: %16lluns\n", watchdog->name, watchdog_data.wd_delta); + pr_warn("Clocksource %20s interval: %16lluns\n", cs->name, watchdog_data.cs_delta); +} + +static void watchdog_handle_remote_timeout(struct clocksource *cs) +{ + pr_info_once("Watchdog remote CPU %u read timed out\n", watchdog_data.curr_cpu); +} + +static void watchdog_print_remote_skew(struct clocksource *cs) +{ + pr_warn("Marking clocksource %s unstable due to inter CPU skew\n", cs->name); + if (watchdog_data.cpu_ts[0] < watchdog_data.cpu_ts[1]) { + pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", smp_processor_id(), + watchdog_data.cpu_ts[0], watchdog_data.curr_cpu, watchdog_data.cpu_ts[1]); + } else { + pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", watchdog_data.curr_cpu, + watchdog_data.cpu_ts[1], smp_processor_id(), watchdog_data.cpu_ts[0]); } - preempt_enable(); - migrate_enable(); - cpus_read_unlock(); - if (!cpumask_empty(&cpus_ahead)) - pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", - cpumask_pr_args(&cpus_ahead), testcpu, cs->name); - if (!cpumask_empty(&cpus_behind)) - pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", - cpumask_pr_args(&cpus_behind), testcpu, cs->name); - pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); -} -EXPORT_SYMBOL_GPL(clocksource_verify_percpu); +} -static inline void clocksource_reset_watchdog(void) +static void watchdog_check_result(struct clocksource *cs) { - struct clocksource *cs; + switch (watchdog_data.result) { + case WD_SUCCESS: + clocksource_tick_stable(cs); + clocksource_enable_highres(cs); + return; - list_for_each_entry(cs, &watchdog_list, wd_list) + case WD_FREQ_TIMEOUT: + watchdog_print_freq_timeout(cs); + 
/* Try again later and invalidate the reference timestamps. */ cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} + return; + case WD_FREQ_NO_WATCHDOG: + case WD_FREQ_RESET: + /* + * Nothing to do when the reference timestamps were reset + * or no watchdog clocksource registered. + */ + return; + + case WD_FREQ_SKEWED: + watchdog_print_freq_skew(cs); + break; + + case WD_CPU_TIMEOUT: + /* Remote check timed out. Try again next cycle. */ + watchdog_handle_remote_timeout(cs); + return; + + case WD_CPU_SKEWED: + watchdog_print_remote_skew(cs); + break; + } + __clocksource_unstable(cs); +} static void clocksource_watchdog(struct timer_list *unused) { - int64_t wd_nsec, cs_nsec, interval; - u64 csnow, wdnow, cslast, wdlast; - int next_cpu, reset_pending; struct clocksource *cs; - enum wd_read_status read_ret; - unsigned long extra_wait = 0; - u32 md; + bool reset_pending; - spin_lock(&watchdog_lock); + guard(spinlock)(&watchdog_lock); if (!watchdog_running) - goto out; + return; reset_pending = atomic_read(&watchdog_reset_pending); list_for_each_entry(cs, &watchdog_list, wd_list) { - /* Clocksource already marked unstable? */ if (cs->flags & CLOCK_SOURCE_UNSTABLE) { if (finished_booting) @@ -446,170 +659,40 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - read_ret = cs_watchdog_read(cs, &csnow, &wdnow); - - if (read_ret == WD_READ_UNSTABLE) { - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); - continue; - } - - /* - * When WD_READ_SKIP is returned, it means the system is likely - * under very heavy load, where the latency of reading - * watchdog/clocksource is very big, and affect the accuracy of - * watchdog check. So give system some space and suspend the - * watchdog check for 5 minutes. - */ - if (read_ret == WD_READ_SKIP) { - /* - * As the watchdog timer will be suspended, and - * cs->last could keep unchanged for 5 minutes, reset - * the counters. 
- */ - clocksource_reset_watchdog(); - extra_wait = HZ * 300; - break; - } - - /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || - atomic_read(&watchdog_reset_pending)) { - cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = wdnow; - cs->cs_last = csnow; - continue; + /* Compare against watchdog clocksource if available */ + if (watchdog_check_freq(cs, reset_pending)) { + /* Check for inter CPU skew */ + watchdog_check_cpu_skew(cs); } - wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow); - cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow); - wdlast = cs->wd_last; /* save these in case we print them */ - cslast = cs->cs_last; - cs->cs_last = csnow; - cs->wd_last = wdnow; - - if (atomic_read(&watchdog_reset_pending)) - continue; - - /* - * The processing of timer softirqs can get delayed (usually - * on account of ksoftirqd not getting to run in a timely - * manner), which causes the watchdog interval to stretch. - * Skew detection may fail for longer watchdog intervals - * on account of fixed margins being used. - * Some clocksources, e.g. acpi_pm, cannot tolerate - * watchdog intervals longer than a few seconds. - */ - interval = max(cs_nsec, wd_nsec); - if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) { - if (system_state > SYSTEM_SCHEDULING && - interval > 2 * watchdog_max_interval) { - watchdog_max_interval = interval; - pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n", - cs_nsec, wd_nsec); - } - watchdog_timer.expires = jiffies; - continue; - } - - /* Check the deviation from the watchdog clocksource. 
*/ - md = cs->uncertainty_margin + watchdog->uncertainty_margin; - if (abs(cs_nsec - wd_nsec) > md) { - s64 cs_wd_msec; - s64 wd_msec; - u32 wd_rem; - - pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", - smp_processor_id(), cs->name); - pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", - watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); - pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", - cs->name, cs_nsec, csnow, cslast, cs->mask); - cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem); - wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem); - pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", - cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); - if (curr_clocksource == cs) - pr_warn(" '%s' is current clocksource.\n", cs->name); - else if (curr_clocksource) - pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name); - else - pr_warn(" No current clocksource.\n"); - __clocksource_unstable(cs); - continue; - } - - if (cs == curr_clocksource && cs->tick_stable) - cs->tick_stable(cs); - - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - /* Mark it valid for high-res. */ - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - - /* - * clocksource_done_booting() will sort it if - * finished_booting is not set yet. - */ - if (!finished_booting) - continue; - - /* - * If this is not the current clocksource let - * the watchdog thread reselect it. Due to the - * change to high res this clocksource might - * be preferred now. If it is the current - * clocksource let the tick code know about - * that change. 
- */ - if (cs != curr_clocksource) { - cs->flags |= CLOCK_SOURCE_RESELECT; - schedule_work(&watchdog_work); - } else { - tick_clock_notify(); - } - } + watchdog_check_result(cs); } - /* - * We only clear the watchdog_reset_pending, when we did a - * full cycle through all clocksources. - */ + /* Clear after the full clocksource walk */ if (reset_pending) atomic_dec(&watchdog_reset_pending); - /* - * Cycle through CPUs to check if the CPUs stay synchronized - * to each other. - */ - next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask); - - /* - * Arm timer if not already pending: could race with concurrent - * pair clocksource_stop_watchdog() clocksource_start_watchdog(). - */ + /* Could have been rearmed by a stop/start cycle */ if (!timer_pending(&watchdog_timer)) { - watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait; - add_timer_on(&watchdog_timer, next_cpu); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_local(&watchdog_timer); } -out: - spin_unlock(&watchdog_lock); } static inline void clocksource_start_watchdog(void) { - if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + if (watchdog_running || list_empty(&watchdog_list)) return; - timer_setup(&watchdog_timer, clocksource_watchdog, 0); + timer_setup(&watchdog_timer, clocksource_watchdog, TIMER_PINNED); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + + add_timer_on(&watchdog_timer, get_boot_cpu_id()); watchdog_running = 1; } static inline void clocksource_stop_watchdog(void) { - if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + if (!watchdog_running || !list_empty(&watchdog_list)) return; timer_delete(&watchdog_timer); watchdog_running = 0; @@ -651,6 +734,13 @@ static void clocksource_select_watchdog(bool fallback) if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) continue; + /* + * If it's not continuous, don't put the fox in charge of + * the henhouse. 
+ */ + if (!(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)) + continue; + /* Skip current if we were requested for a fallback. */ if (fallback && cs == old_wd) continue; @@ -690,12 +780,6 @@ static int __clocksource_watchdog_kthread(void) unsigned long flags; int select = 0; - /* Do any required per-CPU skew verification. */ - if (curr_clocksource && - curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE && - curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU) - clocksource_verify_percpu(curr_clocksource); - spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { @@ -1016,6 +1100,8 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) continue; if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) continue; + if (cs->flags & CLOCK_SOURCE_WDTEST) + continue; return cs; } return NULL; @@ -1040,6 +1126,8 @@ static void __clocksource_select(bool skipcur) continue; if (strcmp(cs->name, override_name) != 0) continue; + if (cs->flags & CLOCK_SOURCE_WDTEST) + continue; /* * Check to make sure we don't switch to a non-highres * capable clocksource if the tick code is in oneshot @@ -1169,31 +1257,10 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); - } - /* - * If the uncertainty margin is not specified, calculate it. If - * both scale and freq are non-zero, calculate the clock period, but - * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default. - * However, if either of scale or freq is zero, be very conservative - * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value - * for the uncertainty margin. Allow stupidly small uncertainty - * margins to be specified by the caller for testing purposes, - * but warn to discourage production use of this capability. 
- * - * Bottom line: The sum of the uncertainty margins of the - * watchdog clocksource and the clocksource under test will be at - * least 500ppm by default. For more information, please see the - * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above. - */ - if (scale && freq && !cs->uncertainty_margin) { - cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); - if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW) - cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW; - } else if (!cs->uncertainty_margin) { - cs->uncertainty_margin = WATCHDOG_THRESHOLD; + /* Update cs::freq_khz */ + cs->freq_khz = div_u64((u64)freq * scale, 1000); } - WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW); /* * Ensure clocksources that have large 'mult' values don't overflow @@ -1241,6 +1308,10 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) cs->id = CSID_GENERIC; + + if (WARN_ON_ONCE(!freq && cs->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) + cs->flags &= ~CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT; + if (cs->vdso_clock_mode < 0 || cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n", diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 860af7a58428..5bd6efe598f0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -50,6 +50,28 @@ #include "tick-internal.h" /* + * Constants to set the queued state of the timer (INACTIVE, ENQUEUED) + * + * The callback state is kept separate in the CPU base because having it in + * the timer would required touching the timer after the callback, which + * makes it impossible to free the timer from the callback function. + * + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status. 
It happens for example when a posix timer expired and the callback + * queued a signal. Between dropping the lock which protects the posix timer + * and reacquiring the base lock of the hrtimer, another CPU can deliver the + * signal and rearm the timer. + * + * All state transitions are protected by cpu_base->lock. + */ +#define HRTIMER_STATE_INACTIVE false +#define HRTIMER_STATE_ENQUEUED true + +/* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an * idea of the (in)accuracy of timers. Timer values are rounded up to @@ -77,43 +99,22 @@ static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ + +#define BASE_INIT(idx, cid) \ + [idx] = { .index = idx, .clockid = cid } + DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), - .clock_base = - { - { - .index = HRTIMER_BASE_MONOTONIC, - .clockid = CLOCK_MONOTONIC, - }, - { - .index = HRTIMER_BASE_REALTIME, - .clockid = CLOCK_REALTIME, - }, - { - .index = HRTIMER_BASE_BOOTTIME, - .clockid = CLOCK_BOOTTIME, - }, - { - .index = HRTIMER_BASE_TAI, - .clockid = CLOCK_TAI, - }, - { - .index = HRTIMER_BASE_MONOTONIC_SOFT, - .clockid = CLOCK_MONOTONIC, - }, - { - .index = HRTIMER_BASE_REALTIME_SOFT, - .clockid = CLOCK_REALTIME, - }, - { - .index = HRTIMER_BASE_BOOTTIME_SOFT, - .clockid = CLOCK_BOOTTIME, - }, - { - .index = HRTIMER_BASE_TAI_SOFT, - .clockid = CLOCK_TAI, - }, + .clock_base = { + BASE_INIT(HRTIMER_BASE_MONOTONIC, CLOCK_MONOTONIC), + BASE_INIT(HRTIMER_BASE_REALTIME, CLOCK_REALTIME), + BASE_INIT(HRTIMER_BASE_BOOTTIME, CLOCK_BOOTTIME), + BASE_INIT(HRTIMER_BASE_TAI, CLOCK_TAI), + BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT, CLOCK_MONOTONIC), + BASE_INIT(HRTIMER_BASE_REALTIME_SOFT, CLOCK_REALTIME), + BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT, CLOCK_BOOTTIME), 
+ BASE_INIT(HRTIMER_BASE_TAI_SOFT, CLOCK_TAI), }, .csd = CSD_INIT(retrigger_next_event, NULL) }; @@ -126,23 +127,43 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) return likely(base->online); } +#ifdef CONFIG_HIGH_RES_TIMERS +DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key); + +static void hrtimer_hres_workfn(struct work_struct *work) +{ + static_branch_enable(&hrtimer_highres_enabled_key); +} + +static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn); + +static inline void hrtimer_schedule_hres_work(void) +{ + if (!hrtimer_highres_enabled()) + schedule_work(&hrtimer_hres_work); +} +#else +static inline void hrtimer_schedule_hres_work(void) { } +#endif + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP - /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { - .cpu_base = &migration_cpu_base, - .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, - &migration_cpu_base.lock), - }, }, + .clock_base = { + [0] = { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, + }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -159,15 +180,13 @@ static struct hrtimer_cpu_base migration_cpu_base = { * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. 
*/ -static -struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) +static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) __acquires(&timer->base->lock) { - struct hrtimer_clock_base *base; - for (;;) { - base = READ_ONCE(timer->base); + struct hrtimer_clock_base *base = READ_ONCE(timer->base); + if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) @@ -220,7 +239,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_ return expires >= new_base->cpu_base->expires_next; } -static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned) { if (!hrtimer_base_is_online(base)) { int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); @@ -248,8 +267,7 @@ static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base * * the timer callback is currently running. */ static inline struct hrtimer_clock_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, - int pinned) +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; @@ -262,13 +280,12 @@ again: if (base != new_base) { /* - * We are trying to move timer to new_base. - * However we can't change timer's base while it is running, - * so we keep it on the same CPU. No hassle vs. reprogramming - * the event source in the high resolution case. The softirq - * code will take care of this when the timer function has - * completed. There is no conflict as we hold the lock until - * the timer is enqueued. + * We are trying to move timer to new_base. 
However we can't + * change timer's base while it is running, so we keep it on + * the same CPU. No hassle vs. reprogramming the event source + * in the high resolution case. The remote CPU will take care + * of this when the timer function has completed. There is no + * conflict as we hold the lock until the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; @@ -278,8 +295,7 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, - this_cpu_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -298,14 +314,13 @@ again: #else /* CONFIG_SMP */ -static inline struct hrtimer_clock_base * -lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); - return base; } @@ -340,7 +355,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div) return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); -#endif /* BITS_PER_LONG >= 64 */ +#endif /* BITS_PER_LONG < 64 */ /* * Add two ktime values and do a safety check for overflow: @@ -422,12 +437,37 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) } } +/* Stub timer callback for improperly used timers. 
*/ +static enum hrtimer_restart stub_timer(struct hrtimer *unused) +{ + WARN_ON_ONCE(1); + return HRTIMER_NORESTART; +} + +/* + * hrtimer_fixup_assert_init is called when: + * - an untracked/uninit-ed object is found + */ +static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0); + return true; + default: + return false; + } +} + static const struct debug_obj_descr hrtimer_debug_descr = { - .name = "hrtimer", - .debug_hint = hrtimer_debug_hint, - .fixup_init = hrtimer_fixup_init, - .fixup_activate = hrtimer_fixup_activate, - .fixup_free = hrtimer_fixup_free, + .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, + .fixup_init = hrtimer_fixup_init, + .fixup_activate = hrtimer_fixup_activate, + .fixup_free = hrtimer_fixup_free, + .fixup_assert_init = hrtimer_fixup_assert_init, }; static inline void debug_hrtimer_init(struct hrtimer *timer) @@ -440,8 +480,7 @@ static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) debug_object_init_on_stack(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_activate(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } @@ -451,6 +490,11 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } +static inline void debug_hrtimer_assert_init(struct hrtimer *timer) +{ + debug_object_assert_init(timer, &hrtimer_debug_descr); +} + void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); @@ -461,9 +505,9 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } -static 
inline void debug_hrtimer_activate(struct hrtimer *timer, - enum hrtimer_mode mode) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } +static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { } #endif static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) @@ -479,80 +523,80 @@ static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_activate(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed) { debug_hrtimer_activate(timer, mode); - trace_hrtimer_start(timer, mode); + trace_hrtimer_start(timer, mode, was_armed); } -static inline void debug_deactivate(struct hrtimer *timer) -{ - debug_hrtimer_deactivate(timer); - trace_hrtimer_cancel(timer); -} +#define for_each_active_base(base, cpu_base, active) \ + for (unsigned int idx = ffs(active); idx--; idx = ffs((active))) \ + for (bool done = false; !done; active &= ~(1U << idx)) \ + for (base = &cpu_base->clock_base[idx]; !done; done = true) -static struct hrtimer_clock_base * -__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) +#define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node) + +#if defined(CONFIG_NO_HZ_COMMON) +/* + * Same as hrtimer_bases_next_event() below, but skips the excluded timer and + * does not update cpu_base->next_timer/expires. 
+ */ +static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, + unsigned int active, ktime_t expires_next) { - unsigned int idx; + struct hrtimer_clock_base *base; + ktime_t expires; - if (!*active) - return NULL; + lockdep_assert_held(&cpu_base->lock); - idx = __ffs(*active); - *active &= ~(1U << idx); + for_each_active_base(base, cpu_base, active) { + expires = ktime_sub(base->expires_next, base->offset); + if (expires >= expires_next) + continue; + + /* + * If the excluded timer is the first on this base evaluate the + * next timer. + */ + struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active); - return &cpu_base->clock_base[idx]; + if (unlikely(&exclude->node == node)) { + node = timerqueue_linked_next(node); + if (!node) + continue; + expires = ktime_sub(node->expires, base->offset); + if (expires >= expires_next) + continue; + } + expires_next = expires; + } + /* If base->offset changed, the result might be negative */ + return max(expires_next, 0); } +#endif -#define for_each_active_base(base, cpu_base, active) \ - while ((base = __next_base((cpu_base), &(active)))) +static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base) +{ + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); -static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, - const struct hrtimer *exclude, - unsigned int active, - ktime_t expires_next) + return hrtimer_from_timerqueue_node(next); +} + +/* Find the base with the earliest expiry */ +static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active, + ktime_t *expires_next, struct hrtimer **next_timer) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *next; - struct hrtimer *timer; - - next = timerqueue_getnext(&base->active); - timer = container_of(next, struct hrtimer, node); 
- if (timer == exclude) { - /* Get to the next timer in the queue. */ - next = timerqueue_iterate_next(next); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - } - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires < expires_next) { - expires_next = expires; - - /* Skip cpu_base update if a timer is being excluded. */ - if (exclude) - continue; - - if (timer->is_soft) - cpu_base->softirq_next_timer = timer; - else - cpu_base->next_timer = timer; + expires = ktime_sub(base->expires_next, base->offset); + if (expires < *expires_next) { + *expires_next = expires; + *next_timer = clock_base_next_timer(base); } } - /* - * clock_was_set() might have changed base->offset of any of - * the clock bases so the result might be negative. Fix it up - * to prevent a false positive in clockevents_program_event(). - */ - if (expires_next < 0) - expires_next = 0; - return expires_next; } /* @@ -575,30 +619,28 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. 
*/ -static ktime_t -__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { - unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; + unsigned int active; + + lockdep_assert_held(&cpu_base->lock); if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; - cpu_base->softirq_next_timer = NULL; - expires_next = __hrtimer_next_event_base(cpu_base, NULL, - active, KTIME_MAX); - - next_timer = cpu_base->softirq_next_timer; + if (active) + hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); + cpu_base->softirq_next_timer = next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + if (active) + hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); cpu_base->next_timer = next_timer; - expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, - expires_next); } - - return expires_next; + return max(expires_next, 0); } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) @@ -638,8 +680,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, - offs_real, offs_boot, offs_tai); + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, + offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; @@ -649,7 +691,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) } /* - * Is the high resolution mode active ? + * Is the high resolution mode active in the CPU base. 
This cannot use the + * static key as the CPUs are switched to high resolution mode + * asynchronously. */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { @@ -657,8 +701,13 @@ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) cpu_base->hres_active : 0; } -static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, - struct hrtimer *next_timer, +static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred) +{ + trace_hrtimer_rearm(expires_next, deferred); + tick_program_event(expires_next, 1); +} + +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; @@ -683,20 +732,13 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; - tick_program_event(expires_next, 1); + hrtimer_rearm_event(expires_next, false); } -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ -static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +/* Reprogram the event source with a evaluation of all clock bases */ +static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal) { - ktime_t expires_next; - - expires_next = hrtimer_update_next_event(cpu_base); + ktime_t expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; @@ -707,57 +749,49 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS -/* - * High resolution timer enabled ? - */ +/* High resolution timer enabled ? 
*/ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); -/* - * Enable / Disable high resolution mode - */ +/* Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } - __setup("highres=", setup_hrtimer_hres); -/* - * hrtimer_high_res_enabled - query, if the highres mode is enabled - */ -static inline int hrtimer_is_hres_enabled(void) +/* hrtimer_high_res_enabled - query, if the highres mode is enabled */ +static inline bool hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } -/* - * Switch to high resolution mode - */ +/* Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - pr_warn("Could not switch to high resolution mode on CPU %u\n", - base->cpu); + pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } - base->hres_active = 1; + base->hres_active = true; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); + hrtimer_schedule_hres_work(); } #else -static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline bool hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level @@ -792,13 +826,12 @@ static void retrigger_next_event(void *arg) * In periodic low resolution mode, the next softirq expiration * must also be updated. 
*/ - raw_spin_lock(&base->lock); + guard(raw_spinlock)(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) - hrtimer_force_reprogram(base, 0); + hrtimer_force_reprogram(base, /* skip_equal */ false); else hrtimer_update_next_event(base); - raw_spin_unlock(&base->lock); } /* @@ -812,10 +845,11 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + ktime_t expires = hrtimer_get_expires(timer); - WARN_ON_ONCE(hrtimer_get_expires(timer) < 0); + WARN_ON_ONCE(expires < 0); + expires = ktime_sub(expires, base->offset); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. @@ -842,8 +876,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; - if (!ktime_before(expires, timer_cpu_base->expires_next) || - !reprogram) + if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } @@ -857,11 +890,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (expires >= cpu_base->expires_next) return; - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. 
- */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) return; cpu_base->next_timer = timer; @@ -869,8 +899,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) __hrtimer_reprogram(cpu_base, timer, expires); } -static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, - unsigned int active) +static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; @@ -896,13 +925,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, if (seq == cpu_base->clock_was_set_seq) return false; - /* - * If the remote CPU is currently handling an hrtimer interrupt, it - * will reevaluate the first expiring timer of all clock bases - * before reprogramming. Nothing to do here. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending the remote CPU will take care of it */ + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; return false; + } /* * Walk the affected clock bases and check whether the first expiring @@ -913,9 +940,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *next; + struct timerqueue_linked_node *next; - next = timerqueue_getnext(&base->active); + next = timerqueue_linked_first(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; @@ -947,11 +974,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, */ void clock_was_set(unsigned int bases) { - struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; - int cpu; - if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active()) + if (!hrtimer_highres_enabled() && !tick_nohz_is_active()) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -960,23 +985,19 @@ 
void clock_was_set(unsigned int bases) } /* Avoid interrupting CPUs if possible */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - unsigned long flags; - - cpu_base = &per_cpu(hrtimer_bases, cpu); - raw_spin_lock_irqsave(&cpu_base->lock, flags); + scoped_guard(cpus_read_lock) { + int cpu; - if (update_needs_ipi(cpu_base, bases)) - cpumask_set_cpu(cpu, mask); + for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + guard(raw_spinlock_irqsave)(&cpu_base->lock); + if (update_needs_ipi(cpu_base, bases)) + cpumask_set_cpu(cpu, mask); + } + scoped_guard(preempt) + smp_call_function_many(mask, retrigger_next_event, NULL, 1); } - - preempt_disable(); - smp_call_function_many(mask, retrigger_next_event, NULL, 1); - preempt_enable(); - cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: @@ -1011,11 +1032,8 @@ void hrtimers_resume_local(void) retrigger_next_event(NULL); } -/* - * Counterpart to lock_hrtimer_base above: - */ -static inline -void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +/* Counterpart to lock_hrtimer_base above */ +static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); @@ -1032,7 +1050,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * .. note:: * This only updates the timer expiry value and does not requeue the timer. * - * There is also a variant of the function hrtimer_forward_now(). + * There is also a variant of this function: hrtimer_forward_now(). * * Context: Can be safely called from the callback function of @timer. 
If called * from other contexts @timer must neither be enqueued nor running the @@ -1042,15 +1060,15 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { - u64 orun = 1; ktime_t delta; + u64 orun = 1; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; - if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + if (WARN_ON(timer->is_queued)) return 0; if (interval < hrtimer_resolution) @@ -1079,73 +1097,98 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the - * red black tree is O(log(n)). Must hold the base lock. + * red black tree is O(log(n)). * * Returns true when the new timer is the leftmost timer in the tree. */ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - enum hrtimer_mode mode) + enum hrtimer_mode mode, bool was_armed) { - debug_activate(timer, mode); + lockdep_assert_held(&base->cpu_base->lock); + + debug_activate(timer, mode, was_armed); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); + + if (!timerqueue_linked_add(&base->active, &timer->node)) + return false; + + base->expires_next = hrtimer_get_expires(timer); + return true; +} - return timerqueue_add(&base->active, &timer->node); +static inline void base_update_next_timer(struct hrtimer_clock_base *base) +{ + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); + + base->expires_next = next ? next->expires : KTIME_MAX; } /* * __remove_hrtimer - internal function to remove a timer * - * Caller must hold the base lock. 
- * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ -static void __remove_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - u8 newstate, int reprogram) +static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; + bool was_first; - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, newstate); - if (!(state & HRTIMER_STATE_ENQUEUED)) + lockdep_assert_held(&cpu_base->lock); + + if (!timer->is_queued) return; - if (!timerqueue_del(&base->active, &timer->node)) + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->is_queued, newstate); + + was_first = !timerqueue_linked_prev(&timer->node); + + if (!timerqueue_linked_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); + /* Nothing to update if this was not the first timer in the base */ + if (!was_first) + return; + + base_update_next_timer(base); + /* - * Note: If reprogram is false we do not update - * cpu_base->next_timer. This happens when we remove the first - * timer on a remote cpu. No harm as we never dereference - * cpu_base->next_timer. So the worst thing what can happen is - * an superfluous call to hrtimer_force_reprogram() on the - * remote cpu later on if the same timer gets enqueued again. + * If reprogram is false don't update cpu_base->next_timer and do not + * touch the clock event device. + * + * This happens when removing the first timer on a remote CPU, which + * will be handled by the remote CPU's interrupt. It also happens when + * a local timer is removed to be immediately restarted. That's handled + * at the call site. 
*/ - if (reprogram && timer == cpu_base->next_timer) - hrtimer_force_reprogram(cpu_base, 1); + if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy) + return; + + if (cpu_base->deferred_rearm) + cpu_base->deferred_needs_update = true; + else + hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); } -/* - * remove hrtimer, called with base lock held - */ -static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - bool restart, bool keep_local) +static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool newstate) { - u8 state = timer->state; + lockdep_assert_held(&base->cpu_base->lock); - if (state & HRTIMER_STATE_ENQUEUED) { + if (timer->is_queued) { bool reprogram; + debug_hrtimer_deactivate(timer); + /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current @@ -1154,24 +1197,81 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ - debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * If the timer is not restarted then reprogramming is - * required if the timer is local. If it is local and about - * to be restarted, avoid programming it twice (on removal - * and a moment later when it's requeued). - */ - if (!restart) - state = HRTIMER_STATE_INACTIVE; - else - reprogram &= !keep_local; + __remove_hrtimer(timer, base, newstate, reprogram); + return true; + } + return false; +} + +/* + * Update in place has to retrieve the expiry times of the neighbour nodes + * if they exist. That is cache line neutral because the dequeue/enqueue + * operation is going to need the same cache lines. But there is a big win + * when the dequeue/enqueue can be avoided because the RB tree does not + * have to be rebalanced twice. 
+ */ +static inline bool +hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires) +{ + struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node); + struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node); + + /* If the new expiry goes behind the next timer, requeue is required */ + if (next && expires > next->expires) + return false; + + /* If this is the first timer, update in place */ + if (!prev) + return true; + + /* Update in place when it does not go ahead of the previous one */ + return expires >= prev->expires; +} + +static inline bool +remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns) +{ + bool was_first = false; + + /* Remove it from the timer queue if active */ + if (timer->is_queued) { + was_first = !timerqueue_linked_prev(&timer->node); + + /* Try to update in place to avoid the de/enqueue dance */ + if (hrtimer_can_update_in_place(timer, base, expires)) { + hrtimer_set_expires_range_ns(timer, expires, delta_ns); + trace_hrtimer_start(timer, mode, true); + if (was_first) + base->expires_next = expires; + return was_first; + } - __remove_hrtimer(timer, base, state, reprogram); - return 1; + debug_hrtimer_deactivate(timer); + timerqueue_linked_del(&base->active, &timer->node); } - return 0; + + /* Set the new expiry time */ + hrtimer_set_expires_range_ns(timer, expires, delta_ns); + + debug_activate(timer, mode, timer->is_queued); + base->cpu_base->active_bases |= 1 << base->index; + + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); + + /* If it's the first expiring timer now or again, update base */ + if (timerqueue_linked_add(&base->active, &timer->node)) { + base->expires_next = expires; + return true; + } + + if (was_first) + base_update_next_timer(base); + + return false; } static inline ktime_t 
hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, @@ -1190,55 +1290,93 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, return tim; } -static void -hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) +static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { - ktime_t expires; - - /* - * Find the next SOFT expiration. - */ - expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* - * reprogramming needs to be triggered, even if the next soft - * hrtimer expires at the same time than the next hard + * Reprogramming needs to be triggered, even if the next soft + * hrtimer expires at the same time as the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* - * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() - * cpu_base->*expires_next is only set by hrtimer_reprogram() + * cpu_base->next_timer is recomputed by __hrtimer_get_next_event() + * cpu_base->expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } -static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode, - struct hrtimer_clock_base *base) +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) +{ + if (static_branch_likely(&timers_migration_enabled)) { + /* + * If it is local and the first expiring timer keep it on the local + * CPU to optimize reprogramming of the clockevent device. Also + * avoid switch_hrtimer_base() overhead when local and pinned. 
+ */ + if (!is_local) + return false; + if (is_first || is_pinned) + return true; + + /* Honour the NOHZ full restrictions */ + if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE)) + return false; + + /* + * If the tick is not stopped or need_resched() is set, then + * there is no point in moving the timer somewhere else. + */ + return !tick_nohz_tick_stopped() || need_resched(); + } + return is_local; +} +#else +static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) +{ + return is_local; +} +#endif + +static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first, + bool is_pinned) +{ + /* If the timer is running the callback it has to stay on its CPU base. */ + if (unlikely(timer->base->running == timer)) + return true; + + return hrtimer_prefer_local(is_local, is_first, is_pinned); +} + +static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, + const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *new_base; - bool force_local, first; + bool is_pinned, first, was_first, keep_base = false; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; - /* - * If the timer is on the local cpu base and is the first expiring - * timer then this might end up reprogramming the hardware twice - * (on removal and on enqueue). To avoid that by prevent the - * reprogram on removal, keep the timer local to the current CPU - * and enforce reprogramming after it is queued no matter whether - * it is the new first expiring timer again or not. - */ - force_local = base->cpu_base == this_cpu_base; - force_local &= base->cpu_base->next_timer == timer; + was_first = cpu_base->next_timer == timer; + is_pinned = !!(mode & HRTIMER_MODE_PINNED); /* - * Don't force local queuing if this enqueue happens on a unplugged - * CPU after hrtimer_cpu_dying() has been invoked. 
+ * Don't keep it local if this enqueue happens on a unplugged CPU + * after hrtimer_cpu_dying() has been invoked. */ - force_local &= this_cpu_base->online; + if (likely(this_cpu_base->online)) { + bool is_local = cpu_base == this_cpu_base; + + keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned); + } + + /* Calculate absolute expiry time for relative timers */ + if (mode & HRTIMER_MODE_REL) + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); + /* Compensate for low resolution granularity */ + tim = hrtimer_update_lowres(timer, tim, mode); /* * Remove an active timer from the queue. In case it is not queued @@ -1250,32 +1388,41 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). + * + * @keep_base is also true if the timer callback is running on a + * remote CPU and for local pinned timers. 
*/ - remove_hrtimer(timer, base, true, force_local); + if (likely(keep_base)) { + first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns); + } else { + /* Keep the ENQUEUED state in case it is queued */ + bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED); - if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); + hrtimer_set_expires_range_ns(timer, tim, delta_ns); - tim = hrtimer_update_lowres(timer, tim, mode); + /* Switch the timer base, if necessary: */ + base = switch_hrtimer_base(timer, base, is_pinned); + cpu_base = base->cpu_base; - hrtimer_set_expires_range_ns(timer, tim, delta_ns); + first = enqueue_hrtimer(timer, base, mode, was_armed); + } - /* Switch the timer base, if necessary: */ - if (!force_local) { - new_base = switch_hrtimer_base(timer, base, - mode & HRTIMER_MODE_PINNED); - } else { - new_base = base; + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; + return false; } - first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) { + if (!was_first || cpu_base != this_cpu_base) { /* - * If the current CPU base is online, then the timer is - * never queued on a remote CPU if it would be the first - * expiring timer there. + * If the current CPU base is online, then the timer is never + * queued on a remote CPU if it would be the first expiring + * timer there unless the timer callback is currently executed + * on the remote CPU. In the latter case the remote CPU will + * re-evaluate the first expiring timer after completing the + * callbacks. */ - if (hrtimer_base_is_online(this_cpu_base)) + if (likely(hrtimer_base_is_online(this_cpu_base))) return first; /* @@ -1283,21 +1430,33 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * already offline. If the timer is the first to expire, * kick the remote CPU to reprogram the clock event. 
*/ - if (first) { - struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + if (first) + smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd); + return false; + } - smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); - } - return 0; + /* + * Special case for the HRTICK timer. It is frequently rearmed and most + * of the time moves the expiry into the future. That's expensive in + * virtual machines and it's better to take the pointless already armed + * interrupt than reprogramming the hardware on every context switch. + * + * If the new expiry is before the armed time, then reprogramming is + * required. + */ + if (timer->is_lazy) { + if (cpu_base->expires_next <= hrtimer_get_expires(timer)) + return false; } /* - * Timer was forced to stay on the current CPU to avoid - * reprogramming on removal and enqueue. Force reprogram the - * hardware by evaluating the new first expiring timer. + * Timer was the first expiring timer and forced to stay on the + * current CPU to avoid reprogramming on removal and enqueue. Force + * reprogram the hardware by evaluating the new first expiring + * timer. */ - hrtimer_force_reprogram(new_base->cpu_base, 1); - return 0; + hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); + return false; } /** @@ -1309,12 +1468,14 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode) +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, + const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; + debug_hrtimer_assert_init(timer); + /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. 
With PREEMPT_RT check the hard @@ -1362,8 +1523,11 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); - if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false, false); + if (!hrtimer_callback_running(timer)) { + ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE); + if (ret) + trace_hrtimer_cancel(timer); + } unlock_hrtimer_base(timer, &flags); @@ -1397,8 +1561,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ -static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, - unsigned long flags) +static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1463,14 +1626,10 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer) spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else -static inline void -hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } -static inline void -hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } -static inline void -hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, - unsigned long flags) { } +static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { } #endif /** @@ -1526,15 +1685,11 @@ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; - unsigned long flags; - 
- raw_spin_lock_irqsave(&cpu_base->lock, flags); + guard(raw_spinlock_irqsave)(&cpu_base->lock); if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - return expires; } @@ -1549,26 +1704,20 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_base->lock, flags); - - if (hrtimer_hres_active(cpu_base)) { - unsigned int active; + unsigned int active; - if (!cpu_base->softirq_activated) { - active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; - expires = __hrtimer_next_event_base(cpu_base, exclude, - active, KTIME_MAX); - } - active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; - expires = __hrtimer_next_event_base(cpu_base, exclude, active, - expires); - } + guard(raw_spinlock_irqsave)(&cpu_base->lock); + if (!hrtimer_hres_active(cpu_base)) + return expires; - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + if (active && !cpu_base->softirq_activated) + expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX); - return expires; + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + if (!active) + return expires; + return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires); } #endif @@ -1612,8 +1761,7 @@ ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); -static void __hrtimer_setup(struct hrtimer *timer, - enum hrtimer_restart (*function)(struct hrtimer *), +static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); @@ -1645,13 +1793,14 @@ static void __hrtimer_setup(struct hrtimer *timer, base += 
hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); + timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); timer->base = &cpu_base->clock_base[base]; - timerqueue_init(&timer->node); + timerqueue_linked_init(&timer->node); - if (WARN_ON_ONCE(!function)) + if (WARN_ON_ONCE(!fn)) ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; else - ACCESS_PRIVATE(timer, function) = function; + ACCESS_PRIVATE(timer, function) = fn; } /** @@ -1710,12 +1859,10 @@ bool hrtimer_active(const struct hrtimer *timer) base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || - base->running == timer) + if (timer->is_queued || base->running == timer) return true; - } while (read_seqcount_retry(&base->seq, seq) || - base != READ_ONCE(timer->base)); + } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } @@ -1729,7 +1876,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * - * On the read side we ensure we observe timer->state and cpu_base->running + * On the read side we ensure we observe timer->is_queued and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. @@ -1738,11 +1885,9 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. 
*/ - -static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, - struct hrtimer_clock_base *base, - struct hrtimer *timer, ktime_t *now, - unsigned long flags) __must_hold(&cpu_base->lock) +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t now, unsigned long flags) + __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; @@ -1754,15 +1899,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, base->running = timer; /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); - __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false); fn = ACCESS_PRIVATE(timer, function); /* @@ -1797,16 +1942,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ - if (restart != HRTIMER_NORESTART && - !(timer->state & HRTIMER_STATE_ENQUEUED)) - enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); + if (restart == HRTIMER_RESTART && !timer->is_queued) + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false); /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. 
*/ raw_write_seqcount_barrier(&base->seq); @@ -1814,23 +1958,24 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, base->running = NULL; } +static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base) +{ + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); + + return next ? hrtimer_from_timerqueue_node(next) : NULL; +} + static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { - struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; + struct hrtimer_clock_base *base; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *node; - ktime_t basenow; - - basenow = ktime_add(now, base->offset); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); + ktime_t basenow = ktime_add(now, base->offset); + struct hrtimer *timer; + while ((timer = clock_base_next_timer(base))) { /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the @@ -1846,7 +1991,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, if (basenow < hrtimer_get_softexpires(timer)) break; - __run_hrtimer(cpu_base, base, timer, &basenow, flags); + __run_hrtimer(cpu_base, base, timer, basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } @@ -1865,7 +2010,7 @@ static __latent_entropy void hrtimer_run_softirq(void) now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); - cpu_base->softirq_activated = 0; + cpu_base->softirq_activated = false; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1875,6 +2020,63 @@ static __latent_entropy void hrtimer_run_softirq(void) #ifdef CONFIG_HIGH_RES_TIMERS /* + * Very similar 
to hrtimer_force_reprogram(), except it deals with + * deferred_rearm and hang_detected. + */ +static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred) +{ + cpu_base->expires_next = expires_next; + cpu_base->deferred_rearm = false; + + if (unlikely(cpu_base->hang_detected)) { + /* + * Give the system a chance to do something else than looping + * on hrtimer interrupts. + */ + expires_next = ktime_add_ns(ktime_get(), + min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time)); + } + hrtimer_rearm_event(expires_next, deferred); +} + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +void __hrtimer_rearm_deferred(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next; + + if (!cpu_base->deferred_rearm) + return; + + guard(raw_spinlock)(&cpu_base->lock); + if (cpu_base->deferred_needs_update) { + hrtimer_update_base(cpu_base); + expires_next = hrtimer_update_next_event(cpu_base); + } else { + /* No timer added/removed. Use the cached value */ + expires_next = cpu_base->deferred_expires_next; + } + hrtimer_rearm(cpu_base, expires_next, true); +} + +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) +{ + /* hrtimer_interrupt() just re-evaluated the first expiring timer */ + cpu_base->deferred_needs_update = false; + /* Cache the expiry time */ + cpu_base->deferred_expires_next = expires_next; + set_thread_flag(TIF_HRTIMER_REARM); +} +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) +{ + hrtimer_rearm(cpu_base, expires_next, false); +} +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + +/* * High resolution timer interrupt * Called with interrupts disabled */ @@ -1888,86 +2090,55 @@ void hrtimer_interrupt(struct clock_event_device *dev) BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; + dev->next_event_forced = 0; 
raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: - cpu_base->in_hrtirq = 1; + cpu_base->deferred_rearm = true; /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. + * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue + * timers while __hrtimer_run_queues() is expiring the clock bases. + * Timers which are re/enqueued on the local CPU are not affected by + * this. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); - /* Reevaluate the clock bases for the [soft] next expiry */ - expires_next = hrtimer_update_next_event(cpu_base); - /* - * Store the new expiry value so the migration code can verify - * against it. - */ - cpu_base->expires_next = expires_next; - cpu_base->in_hrtirq = 0; - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - - /* Reprogramming necessary ? */ - if (!tick_program_event(expires_next, 0)) { - cpu_base->hang_detected = 0; - return; - } - /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * - * We need to prevent that we loop forever in the hrtimer - * interrupt routine. We give it 3 attempts to avoid - * overreacting on some spurious event. - * - * Acquire base lock for updating the offsets and retrieving - * the current time. + * We need to prevent that we loop forever in the hrtimer interrupt + * routine. We give it 3 attempts to avoid overreacting on some + * spurious event. 
*/ - raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); - cpu_base->nr_retries++; - if (++retries < 3) - goto retry; - /* - * Give the system a chance to do something else than looping - * here. We stored the entry time, so we know exactly how long - * we spent here. We schedule the next event this amount of - * time away. - */ - cpu_base->nr_hangs++; - cpu_base->hang_detected = 1; - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + expires_next = hrtimer_update_next_event(cpu_base); + cpu_base->hang_detected = false; + if (expires_next < now) { + if (++retries < 3) + goto retry; + + delta = ktime_sub(now, entry_time); + cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta); + cpu_base->nr_hangs++; + cpu_base->hang_detected = true; + } - delta = ktime_sub(now, entry_time); - if ((unsigned int)delta > cpu_base->max_hang_time) - cpu_base->max_hang_time = (unsigned int) delta; - /* - * Limit it to a sensible value as we enforce a longer - * delay. Give the CPU at least 100ms to catch up. 
- */ - if (delta > 100 * NSEC_PER_MSEC) - expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); - else - expires_next = ktime_add(now, delta); - tick_program_event(expires_next, 1); - pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); + hrtimer_interrupt_rearm(cpu_base, expires_next); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } + #endif /* !CONFIG_HIGH_RES_TIMERS */ /* @@ -1999,7 +2170,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } @@ -2012,8 +2183,7 @@ void hrtimer_run_queues(void) */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { - struct hrtimer_sleeper *t = - container_of(timer, struct hrtimer_sleeper, timer); + struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; @@ -2031,8 +2201,7 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ -void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, - enum hrtimer_mode mode) +void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. 
If the sleeper @@ -2048,8 +2217,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); -static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly @@ -2085,8 +2254,8 @@ static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, * @clock_id: the clock to be used * @mode: timer mode abs/rel */ -void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) { debug_setup_on_stack(&sl->timer, clock_id, mode); __hrtimer_setup_sleeper(sl, clock_id, mode); @@ -2159,12 +2328,11 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, - const clockid_t clockid) +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; - int ret = 0; + int ret; hrtimer_setup_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); @@ -2203,8 +2371,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? 
TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, - CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif @@ -2212,7 +2379,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) + struct old_timespec32 __user *, rmtp) { struct timespec64 tu; @@ -2225,8 +2392,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, - CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif @@ -2236,14 +2402,13 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - int i; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); - timerqueue_init_head(&clock_b->active); + timerqueue_linked_init_head(&clock_b->active); } cpu_base->cpu = cpu; @@ -2257,13 +2422,14 @@ int hrtimers_cpu_starting(unsigned int cpu) /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; - cpu_base->hres_active = 0; - cpu_base->hang_detected = 0; + cpu_base->hres_active = false; + cpu_base->hang_detected = false; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; 
cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->online = 1; + cpu_base->softirq_activated = false; + cpu_base->online = true; return 0; } @@ -2272,20 +2438,20 @@ int hrtimers_cpu_starting(unsigned int cpu) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { + struct timerqueue_linked_node *node; struct hrtimer *timer; - struct timerqueue_node *node; - while ((node = timerqueue_getnext(&old_base->active))) { - timer = container_of(node, struct hrtimer, node); + while ((node = timerqueue_linked_first(&old_base->active))) { + timer = hrtimer_from_timerqueue_node(node); BUG_ON(hrtimer_callback_running(timer)); - debug_deactivate(timer); + debug_hrtimer_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -2295,13 +2461,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * sort out already expired timers and reprogram the * event device. 
*/ - enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { - int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; old_base = this_cpu_ptr(&hrtimer_bases); @@ -2314,16 +2480,14 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); - } + for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); - old_base->online = 0; + old_base->online = false; raw_spin_unlock(&old_base->lock); return 0; diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a5c7d15fce72..1c954f330dfe 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -32,7 +32,6 @@ static u64 jiffies_read(struct clocksource *cs) static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ - .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = jiffies_read, .mask = CLOCKSOURCE_MASK(32), .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ @@ -256,8 +255,6 @@ EXPORT_SYMBOL(proc_dointvec_jiffies); int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos) { - if (SYSCTL_USER_TO_KERN(dir) && USER_HZ < HZ) - return -EINVAL; return proc_dointvec_conv(table, dir, buffer, lenp, ppos, do_proc_int_conv_userhz_jiffies); } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 
652744e00eb4..4bca3f78c8ea 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -18,8 +18,9 @@ #include <linux/cred.h> #include <linux/err.h> #include <linux/mm.h> +#include <linux/cleanup.h> -#include <vdso/datapage.h> +#include "namespace_internal.h" ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) @@ -93,8 +94,8 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!ns->vvar_page) + err = timens_vdso_alloc_vvar_page(ns); + if (err) goto fail_free; err = ns_common_init(ns); @@ -109,7 +110,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, return ns; fail_free_page: - __free_page(ns->vvar_page); + timens_vdso_free_vvar_page(ns); fail_free: kfree(ns); fail_dec: @@ -138,117 +139,7 @@ struct time_namespace *copy_time_ns(u64 flags, return clone_time_ns(user_ns, old_ns); } -static struct timens_offset offset_from_ts(struct timespec64 off) -{ - struct timens_offset ret; - - ret.sec = off.tv_sec; - ret.nsec = off.tv_nsec; - - return ret; -} - -/* - * A time namespace VVAR page has the same layout as the VVAR page which - * contains the system wide VDSO data. - * - * For a normal task the VVAR pages are installed in the normal ordering: - * VVAR - * PVCLOCK - * HVCLOCK - * TIMENS <- Not really required - * - * Now for a timens task the pages are installed in the following order: - * TIMENS - * PVCLOCK - * HVCLOCK - * VVAR - * - * The check for vdso_clock->clock_mode is in the unlikely path of - * the seq begin magic. So for the non-timens case most of the time - * 'seq' is even, so the branch is not taken. - * - * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the - * update to finish and for 'seq' to become even anyway. 
- * - * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which - * enforces the time namespace handling path. - */ -static void timens_setup_vdso_clock_data(struct vdso_clock *vc, - struct time_namespace *ns) -{ - struct timens_offset *offset = vc->offset; - struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); - struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - - vc->seq = 1; - vc->clock_mode = VDSO_CLOCKMODE_TIMENS; - offset[CLOCK_MONOTONIC] = monotonic; - offset[CLOCK_MONOTONIC_RAW] = monotonic; - offset[CLOCK_MONOTONIC_COARSE] = monotonic; - offset[CLOCK_BOOTTIME] = boottime; - offset[CLOCK_BOOTTIME_ALARM] = boottime; -} - -struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). - * For more details check_vma_flags() and __access_remote_vm() - */ - - WARN(1, "vvar_page accessed remotely"); - - return NULL; -} - -/* - * Protects possibly multiple offsets writers racing each other - * and tasks entering the namespace. - */ -static DEFINE_MUTEX(offset_lock); - -static void timens_set_vvar_page(struct task_struct *task, - struct time_namespace *ns) -{ - struct vdso_time_data *vdata; - struct vdso_clock *vc; - unsigned int i; - - if (ns == &init_time_ns) - return; - - /* Fast-path, taken by every task in namespace except the first. */ - if (likely(ns->frozen_offsets)) - return; - - mutex_lock(&offset_lock); - /* Nothing to-do: vvar_page has been already initialized. 
*/ - if (ns->frozen_offsets) - goto out; - - ns->frozen_offsets = true; - vdata = page_address(ns->vvar_page); - vc = vdata->clock_data; - - for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_clock_data(&vc[i], ns); - - if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { - for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) - timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); - } - -out: - mutex_unlock(&offset_lock); -} +DEFINE_MUTEX(timens_offset_lock); void free_time_ns(struct time_namespace *ns) { @@ -256,41 +147,39 @@ void free_time_ns(struct time_namespace *ns) dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_common_free(ns); - __free_page(ns->vvar_page); + timens_vdso_free_vvar_page(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) { - struct time_namespace *ns = NULL; + struct time_namespace *ns; struct nsproxy *nsproxy; - task_lock(task); + guard(task_lock)(task); nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->time_ns; - get_time_ns(ns); - } - task_unlock(task); + if (!nsproxy) + return NULL; - return ns ? &ns->ns : NULL; + ns = nsproxy->time_ns; + get_time_ns(ns); + return &ns->ns; } static struct ns_common *timens_for_children_get(struct task_struct *task) { - struct time_namespace *ns = NULL; + struct time_namespace *ns; struct nsproxy *nsproxy; - task_lock(task); + guard(task_lock)(task); nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->time_ns_for_children; - get_time_ns(ns); - } - task_unlock(task); + if (!nsproxy) + return NULL; - return ns ? 
&ns->ns : NULL; + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + return &ns->ns; } static void timens_put(struct ns_common *ns) @@ -298,12 +187,6 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -void timens_commit(struct task_struct *tsk, struct time_namespace *ns) -{ - timens_set_vvar_page(tsk, ns); - vdso_join_timens(tsk, ns); -} - static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -367,36 +250,33 @@ static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) { - struct ns_common *ns; - struct time_namespace *time_ns; + struct time_namespace *time_ns __free(time_ns) = NULL; + struct ns_common *ns = timens_for_children_get(p); - ns = timens_for_children_get(p); if (!ns) return; + time_ns = to_time_ns(ns); show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); - put_time_ns(time_ns); } int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int noffsets) { - struct ns_common *ns; - struct time_namespace *time_ns; + struct time_namespace *time_ns __free(time_ns) = NULL; + struct ns_common *ns = timens_for_children_get(p); struct timespec64 tp; - int i, err; + int i; - ns = timens_for_children_get(p); if (!ns) return -ESRCH; + time_ns = to_time_ns(ns); - if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { - put_time_ns(time_ns); + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) return -EPERM; - } for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; @@ -409,15 +289,12 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, ktime_get_boottime_ts64(&tp); break; default: - err = -EINVAL; - goto out; + return -EINVAL; } - err = -ERANGE; - if (off->val.tv_sec > KTIME_SEC_MAX || off->val.tv_sec < 
-KTIME_SEC_MAX) - goto out; + return -ERANGE; tp = timespec64_add(tp, off->val); /* @@ -425,16 +302,13 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, * still unreachable. */ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) - goto out; + return -ERANGE; } - mutex_lock(&offset_lock); - if (time_ns->frozen_offsets) { - err = -EACCES; - goto out_unlock; - } + guard(mutex)(&timens_offset_lock); + if (time_ns->frozen_offsets) + return -EACCES; - err = 0; /* Don't report errors after this line */ for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; @@ -452,12 +326,7 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, *offset = off->val; } -out_unlock: - mutex_unlock(&offset_lock); -out: - put_time_ns(time_ns); - - return err; + return 0; } const struct proc_ns_operations timens_operations = { diff --git a/kernel/time/namespace_internal.h b/kernel/time/namespace_internal.h new file mode 100644 index 000000000000..b37ba179f43b --- /dev/null +++ b/kernel/time/namespace_internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TIME_NAMESPACE_INTERNAL_H +#define _TIME_NAMESPACE_INTERNAL_H + +#include <linux/mutex.h> + +struct time_namespace; + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. 
+ */ +extern struct mutex timens_offset_lock; + +#ifdef CONFIG_TIME_NS_VDSO +int timens_vdso_alloc_vvar_page(struct time_namespace *ns); +void timens_vdso_free_vvar_page(struct time_namespace *ns); +#else /* !CONFIG_TIME_NS_VDSO */ +static inline int timens_vdso_alloc_vvar_page(struct time_namespace *ns) +{ + return 0; +} +static inline void timens_vdso_free_vvar_page(struct time_namespace *ns) +{ +} +#endif /* CONFIG_TIME_NS_VDSO */ + +#endif /* _TIME_NAMESPACE_INTERNAL_H */ diff --git a/kernel/time/namespace_vdso.c b/kernel/time/namespace_vdso.c new file mode 100644 index 000000000000..0d74d160eec9 --- /dev/null +++ b/kernel/time/namespace_vdso.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin <avagin@openvz.org> + * Author: Dmitry Safonov <dima@arista.com> + */ + +#include <linux/cleanup.h> +#include <linux/mm.h> +#include <linux/time_namespace.h> +#include <linux/time.h> +#include <linux/vdso_datastore.h> + +#include <vdso/clocksource.h> +#include <vdso/datapage.h> + +#include "namespace_internal.h" + +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_clock->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_clock->clock_mode is a non-issue. 
The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * enforces the time namespace handling path. + */ +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) +{ + struct timens_offset *offset = vc->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops(). + * For more details check_vma_flags() and __access_remote_vm() + */ + + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_time_data *vdata; + struct vdso_clock *vc; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. */ + if (likely(ns->frozen_offsets)) + return; + + guard(mutex)(&timens_offset_lock); + /* Nothing to-do: vvar_page has been already initialized. 
*/ + if (ns->frozen_offsets) + return; + + ns->frozen_offsets = true; + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_clock_data(&vc[i], ns); + + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } +} + +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_clock_data() for details. + */ +static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + guard(mmap_read_lock)(mm); + for_each_vma(vmi, vma) { + if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) + zap_vma(vma); + } + return 0; +} + +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + +int timens_vdso_alloc_vvar_page(struct time_namespace *ns) +{ + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!ns->vvar_page) + return -ENOMEM; + + return 0; +} + +void timens_vdso_free_vvar_page(struct time_namespace *ns) +{ + __free_page(ns->vvar_page); +} diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 413e2389f0a5..9331e1614124 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1092,7 +1092,7 @@ void exit_itimers(struct task_struct *tsk) } /* - * There should be no timers on the ignored list. itimer_delete() has + * There should be no timers on the ignored list. posix_timer_delete() has * mopped them up. 
*/ if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers))) diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index a88b72b0f35e..51f6a1032c83 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = { .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | - CLOCK_EVT_FEAT_KTIME | CLOCK_EVT_FEAT_HRTIMER, .rating = 0, .bound_on = -1, diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f63c65881364..7e57fa31ee26 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -76,8 +76,10 @@ const struct clock_event_device *tick_get_wakeup_device(int cpu) */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { - if (bc) + if (bc) { + bc->next_event_forced = 0; tick_setup_periodic(bc, 1); + } } /* @@ -403,6 +405,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) bool bc_local; raw_spin_lock(&tick_broadcast_lock); + tick_broadcast_device.evtdev->next_event_forced = 0; /* Handle spurious interrupts gracefully */ if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { @@ -696,6 +699,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) raw_spin_lock(&tick_broadcast_lock); dev->next_event = KTIME_MAX; + tick_broadcast_device.evtdev->next_event_forced = 0; next_event = KTIME_MAX; cpumask_clear(tmpmask); now = ktime_get(); @@ -1063,6 +1067,7 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bc->event_handler = tick_handle_oneshot_broadcast; + bc->next_event_forced = 0; bc->next_event = KTIME_MAX; /* @@ -1175,6 +1180,7 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) } /* This moves the broadcast assignment to this CPU: */ + bc->next_event_forced = 0; clockevents_program_event(bc, bc->next_event, 1); } 
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d305d8521896..6a9198a4279b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_event_device *dev) int cpu = smp_processor_id(); ktime_t next = dev->next_event; + dev->next_event_forced = 0; tick_periodic(cpu); /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f7907fadd63f..cbbb87a0c6e7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -345,7 +345,7 @@ static bool check_tick_dependency(atomic_t *dep) int val = atomic_read(dep); if (likely(!tracepoint_enabled(tick_stop))) - return !val; + return !!val; if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); @@ -864,19 +864,32 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); +/* Simplified variant of hrtimer_forward_now() */ +static ktime_t tick_forward_now(ktime_t expires, ktime_t now) +{ + ktime_t delta = now - expires; + + if (likely(delta < TICK_NSEC)) + return expires + TICK_NSEC; + + expires += TICK_NSEC * ktime_divns(delta, TICK_NSEC); + if (expires > now) + return expires; + return expires + TICK_NSEC; +} + static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { - hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + ktime_t expires = ts->last_tick; - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); + if (now >= expires) + expires = tick_forward_now(expires, now); if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + hrtimer_set_expires(&ts->sched_timer, 
expires); + tick_program_event(expires, 1); } /* @@ -1513,6 +1526,7 @@ static void tick_nohz_lowres_handler(struct clock_event_device *dev) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); dev->next_event = KTIME_MAX; + dev->next_event_forced = 0; if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); diff --git a/kernel/time/time.c b/kernel/time/time.c index 0ba8e3c50d62..0d832317d576 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -365,20 +365,16 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) } #endif +#if HZ > MSEC_PER_SEC || (MSEC_PER_SEC % HZ) /** * jiffies_to_msecs - Convert jiffies to milliseconds * @j: jiffies value * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases. - * * Return: milliseconds value */ unsigned int jiffies_to_msecs(const unsigned long j) { -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) +#if HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 @@ -390,7 +386,9 @@ unsigned int jiffies_to_msecs(const unsigned long j) #endif } EXPORT_SYMBOL(jiffies_to_msecs); +#endif +#if (USEC_PER_SEC % HZ) /** * jiffies_to_usecs - Convert jiffies to microseconds * @j: jiffies value @@ -405,17 +403,14 @@ unsigned int jiffies_to_usecs(const unsigned long j) */ BUILD_BUG_ON(HZ > USEC_PER_SEC); -#if !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#else -# if BITS_PER_LONG == 32 +#if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; -# else +#else return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; -# endif #endif } EXPORT_SYMBOL(jiffies_to_usecs); +#endif /** * mktime64 - Converts date to seconds. 
@@ -702,7 +697,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies); * * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ -u64 jiffies_64_to_clock_t(u64 x) +notrace u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91fa2003351c..c493a4010305 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3,34 +3,30 @@ * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. */ -#include <linux/timekeeper_internal.h> -#include <linux/module.h> -#include <linux/interrupt.h> +#include <linux/audit.h> +#include <linux/clocksource.h> +#include <linux/compiler.h> +#include <linux/jiffies.h> #include <linux/kobject.h> -#include <linux/percpu.h> -#include <linux/init.h> -#include <linux/mm.h> +#include <linux/module.h> #include <linux/nmi.h> -#include <linux/sched.h> -#include <linux/sched/loadavg.h> +#include <linux/pvclock_gtod.h> +#include <linux/random.h> #include <linux/sched/clock.h> +#include <linux/sched/loadavg.h> +#include <linux/static_key.h> +#include <linux/stop_machine.h> #include <linux/syscore_ops.h> -#include <linux/clocksource.h> -#include <linux/jiffies.h> +#include <linux/tick.h> #include <linux/time.h> #include <linux/timex.h> -#include <linux/tick.h> -#include <linux/stop_machine.h> -#include <linux/pvclock_gtod.h> -#include <linux/compiler.h> -#include <linux/audit.h> -#include <linux/random.h> +#include <linux/timekeeper_internal.h> #include <vdso/auxclock.h> #include "tick-internal.h" -#include "ntp_internal.h" #include "timekeeping_internal.h" +#include "ntp_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) @@ -275,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } +#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE +#include 
<asm/clock_inlined.h> + +static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined); + /* * tk_clock_read - atomic clocksource read() helper * @@ -288,12 +289,35 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ -static inline u64 tk_clock_read(const struct tk_read_base *tkr) +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + if (static_branch_likely(&clocksource_read_inlined)) + return arch_inlined_clocksource_read(clock); + + return clock->read(clock); +} + +static inline void clocksource_disable_inline_read(void) +{ + static_branch_disable(&clocksource_read_inlined); +} + +static inline void clocksource_enable_inline_read(void) +{ + static_branch_enable(&clocksource_read_inlined); +} +#else +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); return clock->read(clock); } +static inline void clocksource_disable_inline_read(void) { } +static inline void clocksource_enable_inline_read(void) { } +#endif /** * tk_setup_internals - Set up internals to use clocksource clock. @@ -367,6 +391,27 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; + + tk->cs_id = clock->id; + + /* Coupled clockevent data */ + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) && + clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) { + /* + * Aim for an one hour maximum delta and use KHz to handle + * clocksources with a frequency above 4GHz correctly as + * the frequency argument of clocks_calc_mult_shift() is u32. 
+ */ + clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift, + NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000); + /* + * Initialize the conversion limit as the previous clocksource + * might have the same shift/mult pair so the quick check in + * tk_update_ns_to_cyc() fails to update it after a clocksource + * change leaving it effectivly zero. + */ + tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult); + } } /* Timekeeper helper functions. */ @@ -375,7 +420,7 @@ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; @@ -696,6 +741,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } +static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc) +{ + struct tk_read_base *tkrs = &tks->tkr_mono; + struct tk_read_base *tkrc = &tkc->tkr_mono; + unsigned int shift; + + if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) || + !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) + return; + + if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift) + return; + /* + * The conversion math is simple: + * + * CS::MULT (1 << NS_TO_CYC_SHIFT) + * --------------- = ---------------------- + * (1 << CS:SHIFT) NS_TO_CYC_MULT + * + * Ergo: + * + * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT + * + * NS_TO_CYC_SHIFT has been set up in tk_setup_internals() + */ + shift = tkrs->shift + tks->cs_ns_to_cyc_shift; + tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult); + tks->cs_ns_to_cyc_maxns = 
div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult); +} + /* * Restore the shadow timekeeper from the real timekeeper. */ @@ -730,6 +805,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; if (tk->id == TIMEKEEPER_CORE) { + tk_update_ns_to_cyc(tk, &tkd->timekeeper); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); @@ -784,6 +860,71 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_update_coarse_nsecs(tk); } +/* + * ktime_expiry_to_cycles - Convert a expiry time to clocksource cycles + * @id: Clocksource ID which is required for validity + * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted + * @cycles: Pointer to storage for corresponding absolute cycles value + * + * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value + * based on the correlated clocksource of the clockevent device by using + * the base nanoseconds and cycles values of the last timekeeper update and + * converting the delta between @expires_ns and base nanoseconds to cycles. + * + * This only works for clockevent devices which are using a less than or + * equal comparator against the clocksource. + * + * Utilizing this avoids two clocksource reads for such devices, the + * ktime_get() in clockevents_program_event() to calculate the delta expiry + * value and the readout in the device::set_next_event() callback to + * convert the delta back to a absolute comparator value. + * + * Returns: True if @id matches the current clocksource ID, false otherwise + */ +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct tk_read_base *tkrm = &tk->tkr_mono; + ktime_t base_ns, delta_ns, max_ns; + u64 base_cycles, delta_cycles; + unsigned int seq; + u32 mult, shift; + + /* + * Racy check to avoid the seqcount overhead when ID does not match. 
If + * the relevant clocksource is installed concurrently, then this will + * just delay the switch over to this mechanism until the next event is + * programmed. If the ID is not matching the clock events code will use + * the regular relative set_next_event() callback as before. + */ + if (data_race(tk->cs_id) != id) + return false; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + if (tk->cs_id != id) + return false; + + base_cycles = tkrm->cycle_last; + base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift); + + mult = tk->cs_ns_to_cyc_mult; + shift = tk->cs_ns_to_cyc_shift; + max_ns = tk->cs_ns_to_cyc_maxns; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + /* Prevent negative deltas and multiplication overflows */ + delta_ns = min(expires_ns - base_ns, max_ns); + delta_ns = max(delta_ns, 0); + + /* Convert to cycles */ + delta_cycles = ((u64)delta_ns * mult) >> shift; + *cycles = base_cycles + delta_cycles; + return true; +} + /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. 
* @ts: pointer to the timespec to be set @@ -848,7 +989,7 @@ u32 ktime_get_resolution_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); -static ktime_t *offsets[TK_OFFS_MAX] = { +static const ktime_t *const offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, @@ -857,8 +998,9 @@ static ktime_t *offsets[TK_OFFS_MAX] = { ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; + const ktime_t *offset = offsets[offs]; unsigned int seq; - ktime_t base, *offset = offsets[offs]; + ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -878,8 +1020,9 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; - ktime_t base, *offset = offsets[offs]; + const ktime_t *offset = offsets[offs]; unsigned int seq; + ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -902,7 +1045,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { - ktime_t *offset = offsets[offs]; + const ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t tconv; @@ -1631,7 +1774,19 @@ int timekeeping_notify(struct clocksource *clock) if (tk->tkr_mono.clock == clock) return 0; + + /* Disable inlined reads accross the clocksource switch */ + clocksource_disable_inline_read(); + stop_machine(change_clocksource, clock, NULL); + + /* + * If the clocksource has been selected and supports inlined reads + * enable the branch. + */ + if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ) + clocksource_enable_inline_read(); + tick_clock_notify(); return tk->tkr_mono.clock == clock ? 
0 : -1; } @@ -2653,7 +2808,8 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux if (aux_clock) { /* Auxiliary clocks are similar to TAI and do not have leap seconds */ - if (txc->status & (STA_INS | STA_DEL)) + if (txc->modes & ADJ_STATUS && + txc->status & (STA_INS | STA_DEL)) return -EINVAL; /* No TAI offset setting */ @@ -2661,7 +2817,8 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux return -EINVAL; /* No PPS support either */ - if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) + if (txc->modes & ADJ_STATUS && + txc->status & (STA_PPSFREQ | STA_PPSTIME)) return -EINVAL; } @@ -2832,7 +2989,7 @@ static void tk_aux_update_clocksource(void) continue; timekeeping_forward_now(tks); - tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); + tk_setup_internals(tks, tk_core.timekeeper.tkr_raw.clock); timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); } } diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 543beba096c7..198d0608db74 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -9,6 +9,8 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_boot, ktime_t *offs_tai); +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles); + extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern void timekeeping_warp_clock(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 7e1e3bde6b8b..04d928c21aba 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2319,6 +2319,7 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) */ void timer_clear_idle(void) { + int this_cpu = smp_processor_id(); /* * We do this unlocked. The worst outcome is a remote pinned timer * enqueue sending a pointless IPI, but taking the lock would just @@ -2327,9 +2328,9 @@ void timer_clear_idle(void) * path. Required for BASE_LOCAL only. 
*/ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); - if (tick_nohz_full_cpu(smp_processor_id())) + if (tick_nohz_full_cpu(this_cpu)) __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); - trace_timer_base_idle(false, smp_processor_id()); + trace_timer_base_idle(false, this_cpu); /* Activate without holding the timer_base->lock */ tmigr_cpu_activate(); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 488e47e96e93..427d7ddea3af 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,7 +47,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); - SEQ_printf(m, ", S:%02x", timer->state); + SEQ_printf(m, ", S:%02x", timer->is_queued); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), @@ -56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } -static void -print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, - u64 now) +static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + struct timerqueue_linked_node *curr; struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct timerqueue_node *curr; unsigned long flags; next_one: @@ -72,13 +70,13 @@ next_one: raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = timerqueue_getnext(&base->active); + curr = timerqueue_linked_first(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = timerqueue_iterate_next(curr); + curr = timerqueue_linked_next(curr); i++; } @@ -103,8 +101,8 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " 
.resolution: %u nsecs\n", hrtimer_resolution); #ifdef CONFIG_HIGH_RES_TIMERS - SEQ_printf(m, " .offset: %Lu nsecs\n", - (unsigned long long) ktime_to_ns(base->offset)); + SEQ_printf(m, " .offset: %Ld nsecs\n", + (long long) base->offset); #endif SEQ_printf(m, "active timers:\n"); print_active_timers(m, base, now + ktime_to_ns(base->offset)); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index c1ed0d5e8de6..155eeaea4113 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1559,8 +1559,6 @@ int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; int cpu; - lockdep_assert_cpus_held(); - if (!works) return -ENOMEM; if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) @@ -1570,6 +1568,7 @@ int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) * First set previously isolated CPUs as available (unisolate). * This cpumask contains only CPUs that switched to available now. 
*/ + guard(cpus_read_lock)(); cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); @@ -1626,7 +1625,6 @@ static int __init tmigr_init_isolation(void) cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); /* Protect against RCU torture hotplug testing */ - guard(cpus_read_lock)(); return tmigr_isolated_exclude_cpumask(cpumask); } late_initcall(tmigr_init_isolation); diff --git a/kernel/torture.c b/kernel/torture.c index ec3370986976..62c1ac777694 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -93,7 +93,7 @@ int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode { ktime_t hto = baset_ns; - if (trsp) + if (trsp && fuzzt_ns) hto += torture_random(trsp) % fuzzt_ns; set_current_state(TASK_IDLE); return schedule_hrtimeout(&hto, mode); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 49de13cae428..e130da35808f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1281,4 +1281,18 @@ config HIST_TRIGGERS_DEBUG source "kernel/trace/rv/Kconfig" +config TRACE_REMOTE + bool + +config SIMPLE_RING_BUFFER + bool + +config TRACE_REMOTE_TEST + tristate "Test module for remote tracing" + select TRACE_REMOTE + select SIMPLE_RING_BUFFER + help + This trace remote includes a ring-buffer writer implementation using + "simple_ring_buffer". This is solely intending for testing. 
+ endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 04096c21d06b..4d4229e5eec4 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING) += trace_pid.o +obj-$(CONFIG_TRACER_SNAPSHOT) += trace_snapshot.o obj-$(CONFIG_TRACING) += pid_list.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o @@ -128,4 +129,63 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ +obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o +obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o +obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o + +# +# simple_ring_buffer is used by the pKVM hypervisor which does not have access +# to all kernel symbols. Fail the build if forbidden symbols are found. +# +# undefsyms_base generates a set of compiler and tooling-generated symbols that can +# safely be ignored for simple_ring_buffer. 
+# +filechk_undefsyms_base = \ + echo '$(pound)include <linux/atomic.h>'; \ + echo '$(pound)include <linux/string.h>'; \ + echo '$(pound)include <asm/page.h>'; \ + echo 'static char page[PAGE_SIZE] __aligned(PAGE_SIZE);'; \ + echo 'void undefsyms_base(void *p, int n);'; \ + echo 'void undefsyms_base(void *p, int n) {'; \ + echo ' char buffer[256] = { 0 };'; \ + echo ' u32 u = 0;'; \ + echo ' memset((char * volatile)page, 8, PAGE_SIZE);'; \ + echo ' memset((char * volatile)buffer, 8, sizeof(buffer));'; \ + echo ' memcpy((void * volatile)p, buffer, sizeof(buffer));'; \ + echo ' cmpxchg((u32 * volatile)&u, 0, 8);'; \ + echo ' WARN_ON(n == 0xdeadbeef);'; \ + echo '}' + +$(obj)/undefsyms_base.c: FORCE + $(call filechk,undefsyms_base) + +clean-files += undefsyms_base.c + +$(obj)/undefsyms_base.o: $(obj)/undefsyms_base.c + +targets += undefsyms_base.o + +# Ensure KASAN is enabled to avoid logic that may disable FORTIFY_SOURCE when +# KASAN is not enabled. undefsyms_base.o does not automatically get KASAN flags +# because it is not linked into vmlinux. 
+KASAN_SANITIZE_undefsyms_base.o := y + +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \ + __msan simple_ring_buffer \ + $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}') + +quiet_cmd_check_undefined = NM $< + cmd_check_undefined = \ + undefsyms=$$($(NM) -u $< | grep -v $(addprefix -e , $(UNDEFINED_ALLOWLIST)) || true); \ + if [ -n "$$undefsyms" ]; then \ + echo "Unexpected symbols in $<:" >&2; \ + echo "$$undefsyms" >&2; \ + false; \ + fi + +$(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE + $(call if_changed,check_undefined) + +always-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o.checked + libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 30259dcaa838..8cd2520b4c99 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -383,8 +383,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, cpu = raw_smp_processor_id(); if (blk_tracer) { - tracing_record_cmdline(current); - buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); switch (bt->version) { @@ -419,6 +417,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (!event) return; + tracing_record_cmdline(current); switch (bt->version) { case 1: record_blktrace_event(ring_buffer_event_data(event), diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9bc0dfd235af..af7079aa0f36 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2454,8 +2454,10 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_kprobe_multi_link *kmulti_link; + bool has_cookies; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + has_cookies = !!kmulti_link->cookies; seq_printf(seq, "kprobe_cnt:\t%u\n" @@ -2467,7 +2469,7 @@ static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link 
*link, for (int i = 0; i < kmulti_link->cnt; i++) { seq_printf(seq, "%llu\t %pS\n", - kmulti_link->cookies[i], + has_cookies ? kmulti_link->cookies[i] : 0, (void *)kmulti_link->addrs[i]); } } @@ -2750,6 +2752,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!is_kprobe_multi(prog)) return -EINVAL; + /* kprobe_multi is not allowed to be sleepable. */ + if (prog->sleepable) + return -EINVAL; + /* Writing to context is not allowed for kprobes. */ if (prog->aux->kprobe_write_ctx) return -EINVAL; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index dcadf1d23b8a..56d145017902 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -450,8 +450,6 @@ static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops used += FPROBE_HEADER_SIZE_IN_LONG + size_words; } } - if (used < reserved_words) - memset(fgraph_data + used, 0, reserved_words - used); /* If any exit_handler is set, data must be used. */ return used != 0; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 827fb9a0bf0d..b2611de3f594 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6404,6 +6404,7 @@ int update_ftrace_direct_add(struct ftrace_ops *ops, struct ftrace_hash *hash) new_filter_hash = old_filter_hash; } } else { + guard(mutex)(&ftrace_lock); err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); /* * new_filter_hash is dup-ed, so we need to release it anyway, @@ -6530,6 +6531,7 @@ int update_ftrace_direct_del(struct ftrace_ops *ops, struct ftrace_hash *hash) ops->func_hash->filter_hash = NULL; } } else { + guard(mutex)(&ftrace_lock); err = ftrace_update_ops(ops, new_filter_hash, EMPTY_HASH); /* * new_filter_hash is dup-ed, so we need to release it anyway, @@ -6604,9 +6606,9 @@ int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, b if (!orig_hash) goto unlock; - /* Enable the tmp_ops to have the same functions as the direct ops */ + /* Enable the tmp_ops to have 
the same functions as the hash object. */ ftrace_ops_init(&tmp_ops); - tmp_ops.func_hash = ops->func_hash; + tmp_ops.func_hash->filter_hash = hash; err = register_ftrace_function_nolock(&tmp_ops); if (err) @@ -6839,7 +6841,8 @@ bool ftrace_filter_param __initdata; static int __init set_ftrace_notrace(char *str) { ftrace_filter_param = true; - strscpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_notrace_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); @@ -6847,7 +6850,8 @@ __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { ftrace_filter_param = true; - strscpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_filter_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); @@ -6859,14 +6863,16 @@ static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer); static int __init set_graph_function(char *str) { - strscpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_graph_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_filter=", set_graph_function); static int __init set_graph_notrace_function(char *str) { - strscpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); + trace_append_boot_param(ftrace_graph_notrace_buf, str, ',', + FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_graph_notrace=", set_graph_notrace_function); @@ -8611,6 +8617,7 @@ ftrace_pid_follow_sched_process_fork(void *data, struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_sched(tr->function_pids); trace_filter_add_remove_task(pid_list, self, task); @@ -8624,6 +8631,7 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_sched(tr->function_pids); 
trace_filter_add_remove_task(pid_list, NULL, task); @@ -9263,6 +9271,15 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr) * @addrs array, which needs to be big enough to store at least @cnt * addresses. * + * For a single symbol (cnt == 1), uses kallsyms_lookup_name() which + * performs an O(log N) binary search via the sorted kallsyms index. + * This avoids the full O(N) linear scan over all kernel symbols that + * the multi-symbol path requires. + * + * For multiple symbols, uses a single-pass linear scan via + * kallsyms_on_each_symbol() with binary search into the sorted input + * array. + * * Returns: 0 if all provided symbols are found, -ESRCH otherwise. */ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) @@ -9270,6 +9287,19 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a struct kallsyms_data args; int found_all; + /* Fast path: single symbol uses O(log N) binary search */ + if (cnt == 1) { + addrs[0] = kallsyms_lookup_name(sorted_syms[0]); + if (addrs[0] && ftrace_location(addrs[0])) + return 0; + /* + * Binary lookup can fail for duplicate symbol names + * where the first match is not ftrace-instrumented. + * Retry with linear scan. 
+ */ + } + + /* Batch path: single-pass O(N) linear scan */ memset(addrs, 0, sizeof(*addrs) * cnt); args.addrs = addrs; args.syms = sorted_syms; diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c new file mode 100644 index 000000000000..6c1b7701ddae --- /dev/null +++ b/kernel/trace/remote_test.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <linux/module.h> +#include <linux/simple_ring_buffer.h> +#include <linux/trace_remote.h> +#include <linux/tracefs.h> +#include <linux/types.h> + +#define REMOTE_EVENT_INCLUDE_FILE kernel/trace/remote_test_events.h +#include <trace/define_remote_events.h> + +static DEFINE_PER_CPU(struct simple_rb_per_cpu *, simple_rbs); +static struct trace_buffer_desc *remote_test_buffer_desc; + +/* + * The trace_remote lock already serializes accesses from the trace_remote_callbacks. + * However write_event can still race with load/unload. 
+ */ +static DEFINE_MUTEX(simple_rbs_lock); + +static int remote_test_load_simple_rb(int cpu, struct ring_buffer_desc *rb_desc) +{ + struct simple_rb_per_cpu *cpu_buffer; + struct simple_buffer_page *bpages; + int ret = -ENOMEM; + + cpu_buffer = kmalloc_obj(*cpu_buffer); + if (!cpu_buffer) + return ret; + + bpages = kmalloc_objs(*bpages, rb_desc->nr_page_va); + if (!bpages) + goto err_free_cpu_buffer; + + ret = simple_ring_buffer_init(cpu_buffer, bpages, rb_desc); + if (ret) + goto err_free_bpages; + + scoped_guard(mutex, &simple_rbs_lock) { + WARN_ON(*per_cpu_ptr(&simple_rbs, cpu)); + *per_cpu_ptr(&simple_rbs, cpu) = cpu_buffer; + } + + return 0; + +err_free_bpages: + kfree(bpages); + +err_free_cpu_buffer: + kfree(cpu_buffer); + + return ret; +} + +static void remote_test_unload_simple_rb(int cpu) +{ + struct simple_rb_per_cpu *cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + struct simple_buffer_page *bpages; + + if (!cpu_buffer) + return; + + guard(mutex)(&simple_rbs_lock); + + bpages = cpu_buffer->bpages; + simple_ring_buffer_unload(cpu_buffer); + kfree(bpages); + kfree(cpu_buffer); + *per_cpu_ptr(&simple_rbs, cpu) = NULL; +} + +static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unused) +{ + struct ring_buffer_desc *rb_desc; + struct trace_buffer_desc *desc; + size_t desc_size; + int cpu, ret; + + if (WARN_ON(remote_test_buffer_desc)) + return ERR_PTR(-EINVAL); + + desc_size = trace_buffer_desc_size(size, num_possible_cpus()); + if (desc_size == SIZE_MAX) { + ret = -E2BIG; + goto err; + } + + desc = kmalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto err; + } + + ret = trace_remote_alloc_buffer(desc, desc_size, size, cpu_possible_mask); + if (ret) + goto err_free_desc; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + ret = remote_test_load_simple_rb(rb_desc->cpu, rb_desc); + if (ret) + goto err_unload; + } + + remote_test_buffer_desc = desc; + + return remote_test_buffer_desc; + +err_unload: + 
for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + remote_test_unload_simple_rb(rb_desc->cpu); + trace_remote_free_buffer(remote_test_buffer_desc); + +err_free_desc: + kfree(desc); + +err: + return ERR_PTR(ret); +} + +static void remote_test_unload(struct trace_buffer_desc *desc, void *unused) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + if (WARN_ON(desc != remote_test_buffer_desc)) + return; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) + remote_test_unload_simple_rb(rb_desc->cpu); + + remote_test_buffer_desc = NULL; + trace_remote_free_buffer(desc); + kfree(desc); +} + +static int remote_test_enable_tracing(bool enable, void *unused) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + if (!remote_test_buffer_desc) + return -ENODEV; + + for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + WARN_ON(simple_ring_buffer_enable_tracing(*per_cpu_ptr(&simple_rbs, rb_desc->cpu), + enable)); + return 0; +} + +static int remote_test_swap_reader_page(unsigned int cpu, void *unused) +{ + struct simple_rb_per_cpu *cpu_buffer; + + if (cpu >= NR_CPUS) + return -EINVAL; + + cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + if (!cpu_buffer) + return -EINVAL; + + return simple_ring_buffer_swap_reader_page(cpu_buffer); +} + +static int remote_test_reset(unsigned int cpu, void *unused) +{ + struct simple_rb_per_cpu *cpu_buffer; + + if (cpu >= NR_CPUS) + return -EINVAL; + + cpu_buffer = *per_cpu_ptr(&simple_rbs, cpu); + if (!cpu_buffer) + return -EINVAL; + + return simple_ring_buffer_reset(cpu_buffer); +} + +static int remote_test_enable_event(unsigned short id, bool enable, void *unused) +{ + if (id != REMOTE_TEST_EVENT_ID) + return -EINVAL; + + /* + * Let's just use the struct remote_event enabled field that is turned on and off by + * trace_remote. This is a bit racy but good enough for a simple test module. 
+ */ + return 0; +} + +static ssize_t +write_event_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *pos) +{ + struct remote_event_format_selftest *evt_test; + struct simple_rb_per_cpu *cpu_buffer; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&simple_rbs_lock); + + if (!remote_event_selftest.enabled) + return -ENODEV; + + guard(preempt)(); + + cpu_buffer = *this_cpu_ptr(&simple_rbs); + if (!cpu_buffer) + return -ENODEV; + + evt_test = simple_ring_buffer_reserve(cpu_buffer, + sizeof(struct remote_event_format_selftest), + trace_clock_global()); + if (!evt_test) + return -ENODEV; + + evt_test->hdr.id = REMOTE_TEST_EVENT_ID; + evt_test->id = val; + + simple_ring_buffer_commit(cpu_buffer); + + return cnt; +} + +static const struct file_operations write_event_fops = { + .write = write_event_write, +}; + +static int remote_test_init_tracefs(struct dentry *d, void *unused) +{ + return tracefs_create_file("write_event", 0200, d, NULL, &write_event_fops) ? 
+ 0 : -ENOMEM; +} + +static struct trace_remote_callbacks trace_remote_callbacks = { + .init = remote_test_init_tracefs, + .load_trace_buffer = remote_test_load, + .unload_trace_buffer = remote_test_unload, + .enable_tracing = remote_test_enable_tracing, + .swap_reader_page = remote_test_swap_reader_page, + .reset = remote_test_reset, + .enable_event = remote_test_enable_event, +}; + +static int __init remote_test_init(void) +{ + return trace_remote_register("test", &trace_remote_callbacks, NULL, + &remote_event_selftest, 1); +} + +module_init(remote_test_init); + +MODULE_DESCRIPTION("Test module for the trace remote interface"); +MODULE_AUTHOR("Vincent Donnefort"); +MODULE_LICENSE("GPL"); diff --git a/kernel/trace/remote_test_events.h b/kernel/trace/remote_test_events.h new file mode 100644 index 000000000000..26b93b3406fc --- /dev/null +++ b/kernel/trace/remote_test_events.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define REMOTE_TEST_EVENT_ID 1 + +REMOTE_EVENT(selftest, REMOTE_TEST_EVENT_ID, + RE_STRUCT( + re_field(u64, id) + ), + RE_PRINTK("id=%llu", __entry->id) +); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f16f053ef77d..cef49f8871d2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ +#include <linux/ring_buffer_types.h> #include <linux/sched/isolation.h> #include <linux/trace_recursion.h> #include <linux/trace_events.h> @@ -157,23 +158,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s) /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS 
-# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) - /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -316,10 +300,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -338,12 +318,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) #define RB_MISSED_MASK (3 << 30) -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ -}; - struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ @@ -437,14 +411,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order) return dpage; } -/* - * We need to fit the time_stamp delta into 27 bits. 
- */ -static inline bool test_time_stamp(u64 delta) -{ - return !!(delta & TS_DELTA_TEST); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; @@ -555,10 +521,12 @@ struct ring_buffer_per_cpu { unsigned int mapped; unsigned int user_mapped; /* user space mapping */ struct mutex mapping_lock; - unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct buffer_page **subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; struct ring_buffer_cpu_meta *ring_meta; + struct ring_buffer_remote *remote; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -581,6 +549,8 @@ struct trace_buffer { struct ring_buffer_per_cpu **buffers; + struct ring_buffer_remote *remote; + struct hlist_node node; u64 (*clock)(void); @@ -627,16 +597,17 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); - trace_seq_printf(s, "\tfield: int overwrite;\t" + trace_seq_printf(s, "\tfield: char overwrite;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), 1, - (unsigned int)is_signed_type(long)); + (unsigned int)is_signed_type(char)); trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)buffer->subbuf_size, + (unsigned int)(buffer ? 
buffer->subbuf_size : + PAGE_SIZE - BUF_PAGE_HDR_SIZE), (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); @@ -2053,7 +2024,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) entries += ret; entry_bytes += local_read(&head_page->page->commit); - local_set(&cpu_buffer->head_page->entries, ret); + local_set(&head_page->entries, ret); if (head_page == cpu_buffer->commit_page) break; @@ -2238,6 +2209,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, } } +static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu) +{ + struct ring_buffer_desc *desc, *end; + size_t len; + int i; + + if (!trace_desc) + return NULL; + + if (cpu >= trace_desc->nr_cpus) + return NULL; + + end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len); + desc = __first_ring_buffer_desc(trace_desc); + len = struct_size(desc, page_va, desc->nr_page_va); + desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu)); + + if (desc < end && desc->cpu == cpu) + return desc; + + /* Missing CPUs, need to linear search */ + for_each_ring_buffer_desc(desc, i, trace_desc) { + if (desc->cpu == cpu) + return desc; + } + + return NULL; +} + +static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, unsigned int page_id) +{ + return page_id >= desc->nr_page_va ? 
NULL : (void *)desc->page_va[page_id]; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { @@ -2245,6 +2250,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; + struct ring_buffer_desc *desc = NULL; long i; /* @@ -2273,6 +2279,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + if (buffer->remote) { + desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu); + if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1))) + return -EINVAL; + } + for (i = 0; i < nr_pages; i++) { bpage = alloc_cpu_page(cpu_buffer->cpu); @@ -2297,6 +2309,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; + } else if (desc) { + void *p = ring_buffer_desc_page(desc, i + 1); + + if (WARN_ON(!p)) + goto free_pages; + + bpage->page = p; + bpage->range = 1; /* bpage->page can't be freed */ + bpage->id = i + 1; + cpu_buffer->subbuf_ids[i + 1] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); @@ -2394,6 +2416,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; + } else if (buffer->remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu); + + if (!desc) + goto fail_free_reader; + + cpu_buffer->remote = buffer->remote; + cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va; + cpu_buffer->nr_pages = nr_pages; + cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, + sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL); + if 
(!cpu_buffer->subbuf_ids) + goto fail_free_reader; + + /* Remote buffers are read-only and immutable */ + atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); + + bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id); + if (!bpage->page) + goto fail_free_reader; + + bpage->range = 1; + cpu_buffer->subbuf_ids[0] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu, order); @@ -2453,6 +2499,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) irq_work_sync(&cpu_buffer->irq_work.work); + if (cpu_buffer->remote) + kfree(cpu_buffer->subbuf_ids); + free_buffer_page(cpu_buffer->reader_page); if (head) { @@ -2475,7 +2524,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, unsigned long scratch_size, - struct lock_class_key *key) + struct lock_class_key *key, + struct ring_buffer_remote *remote) { struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; @@ -2515,6 +2565,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + cpu = raw_smp_processor_id(); + /* If start/end are specified, then that overrides size */ if (start && end) { unsigned long buffers_start; @@ -2570,6 +2622,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_end = end; rb_range_meta_init(buffer, nr_pages, scratch_size); + } else if (remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu); + + buffer->remote = remote; + /* The writer is remote. 
This ring-buffer is read-only */ + atomic_inc(&buffer->record_disabled); + nr_pages = desc->nr_page_va - 1; + if (nr_pages < 2) + goto fail_free_buffers; } else { /* need at least two pages */ @@ -2578,7 +2639,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, nr_pages = 2; } - cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) @@ -2620,7 +2680,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0, 0, key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2647,7 +2707,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag struct lock_class_key *key) { return alloc_buffer(size, flags, order, start, start + range_size, - scratch_size, key); + scratch_size, key, NULL); +} + +/** + * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote + * @remote: Contains a description of the ring-buffer pages and remote callbacks. + * @key: ring buffer reader_lock_key. 
+ */ +struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key) +{ + return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote); } void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) @@ -4435,18 +4506,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); if (ret < 0) { if (delta < ts) { - buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", - cpu_buffer->cpu, ts, delta); + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n", + cpu_buffer->cpu, ts, delta, + cpu_buffer->buffer->clock); goto out; } } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { - buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, - full ? " (full)" : "", show_interrupt_level()); + full ? 
" (full)" : "", show_interrupt_level(), + cpu_buffer->buffer->clock); } out: atomic_dec(this_cpu_ptr(&checking)); @@ -5274,10 +5347,61 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_overruns); +static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries)); + local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun)); + local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched)); + local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost)); + + return rb_num_of_entries(cpu_buffer); +} + +static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *next, *orig; + int retry = 3; + + orig = next = cpu_buffer->head_page; + rb_inc_page(&next); + + /* Run after the writer */ + while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) { + rb_inc_page(&next); + + rb_list_head_clear(cpu_buffer->head_page->list.prev); + rb_inc_page(&cpu_buffer->head_page); + rb_set_list_to_head(cpu_buffer->head_page->list.prev); + + if (cpu_buffer->head_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } + + orig = cpu_buffer->commit_page = cpu_buffer->head_page; + retry = 3; + + while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) { + rb_inc_page(&next); + rb_inc_page(&cpu_buffer->commit_page); + + if (cpu_buffer->commit_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } +} + static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + if (cpu_buffer->remote) { + rb_read_remote_meta_page(cpu_buffer); + rb_update_remote_head(cpu_buffer); + } + /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; @@ -5428,7 +5552,65 @@ 
rb_update_iter_read_stamp(struct ring_buffer_iter *iter, } static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last; + + if (!rb_read_remote_meta_page(cpu_buffer)) + return NULL; + + /* More to read on the reader page */ + if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) { + if (!cpu_buffer->reader_page->read) + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + return cpu_buffer->reader_page; + } + + prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu, + cpu_buffer->remote->priv)); + /* nr_pages doesn't include the reader page */ + if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages)) + return NULL; + + new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(prev_reader == new_reader); + + prev_head = new_reader; /* New reader was also the previous head */ + new_head = prev_head; + rb_inc_page(&new_head); + last = prev_head; + rb_dec_page(&last); + + /* Clear the old HEAD flag */ + rb_list_head_clear(cpu_buffer->head_page->list.prev); + + prev_reader->list.next = prev_head->list.next; + prev_reader->list.prev = prev_head->list.prev; + + /* Swap prev_reader with new_reader */ + last->list.next = &prev_reader->list; + new_head->list.prev = &prev_reader->list; + + new_reader->list.prev = &new_reader->list; + new_reader->list.next = &new_head->list; + + /* Reactivate the HEAD flag */ + rb_set_list_to_head(&last->list); + + cpu_buffer->head_page = new_head; + cpu_buffer->reader_page = new_reader; + cpu_buffer->pages = &new_head->list; + cpu_buffer->read_stamp = new_reader->page->time_stamp; + cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events; + + return 
rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL; +} + +static struct buffer_page * +__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); @@ -5598,6 +5780,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) return reader; } +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) : + __rb_get_reader_page(cpu_buffer); +} + static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; @@ -6154,6 +6343,8 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) meta->entries = local_read(&cpu_buffer->entries); meta->overrun = local_read(&cpu_buffer->overrun); meta->read = cpu_buffer->read; + meta->pages_lost = local_read(&cpu_buffer->pages_lost); + meta->pages_touched = local_read(&cpu_buffer->pages_touched); /* Some archs do not have data cache coherency between kernel and user-space */ flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); @@ -6164,6 +6355,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; + if (cpu_buffer->remote) { + if (!cpu_buffer->remote->reset) + return; + + cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv); + rb_read_remote_meta_page(cpu_buffer); + + /* Read related values, not covered by the meta-page */ + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + cpu_buffer->last_overrun = 0; + cpu_buffer->reader_page->read = 0; + + return; + } + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page @@ -6394,6 +6602,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu 
*cpu_buffer; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + if (rb_read_remote_meta_page(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + + return 0; + } + + guard(cpus_read_lock)(); + + /* + * Make sure all the ring buffers are up to date before we start reading + * them. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + rb_read_remote_meta_page(cpu_buffer); + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (rb_num_of_entries(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + } + + return 0; +} + #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers @@ -6632,6 +6880,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int commit; unsigned int read; u64 save_timestamp; + bool force_memcpy; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -1; @@ -6669,6 +6918,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; + force_memcpy = cpu_buffer->mapped || cpu_buffer->remote; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -6678,7 +6929,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || - cpu_buffer->mapped) { + force_memcpy) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -7034,7 +7285,7 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) } static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long *subbuf_ids) + struct buffer_page **subbuf_ids) { struct 
trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; @@ -7043,7 +7294,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, int id = 0; id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); - subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page; + subbuf_ids[id++] = cpu_buffer->reader_page; cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); @@ -7053,7 +7304,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, if (WARN_ON(id >= nr_subbufs)) break; - subbuf_ids[id] = (unsigned long)subbuf->page; + subbuf_ids[id] = subbuf; rb_inc_page(&subbuf); id++; @@ -7062,7 +7313,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, WARN_ON(cnt != nr_subbufs); - /* install subbuf ID to kern VA translation */ + /* install subbuf ID to bpage translation */ cpu_buffer->subbuf_ids = subbuf_ids; meta->meta_struct_len = sizeof(*meta); @@ -7218,13 +7469,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, } while (p < nr_pages) { + struct buffer_page *subbuf; struct page *page; int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) return -EINVAL; - page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + subbuf = cpu_buffer->subbuf_ids[s]; + page = virt_to_page((void *)subbuf->page); for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) @@ -7251,10 +7504,11 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags, *subbuf_ids; + struct buffer_page **subbuf_ids; + unsigned long flags; int err; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; @@ -7275,7 +7529,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, if (err) return err; - /* subbuf_ids include the reader while nr_pages does not */ + 
/* subbuf_ids includes the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); @@ -7310,6 +7564,27 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, return err; } +/* + * This is called when a VMA is duplicated (e.g., on fork()) to increment + * the user_mapped counter without remapping pages. + */ +void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask))) + return; + + cpu_buffer = buffer->buffers[cpu]; + + guard(mutex)(&cpu_buffer->mapping_lock); + + if (cpu_buffer->user_mapped) + __rb_inc_dec_mapped(cpu_buffer, true); + else + WARN(1, "Unexpected buffer stat, it should be mapped"); +} + int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; @@ -7447,6 +7722,12 @@ out: return 0; } +static void rb_cpu_sync(void *data) +{ + /* Not really needed, but documents what is happening */ + smp_rmb(); +} + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in @@ -7485,7 +7766,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) cpu); return -ENOMEM; } - smp_wmb(); + + /* + * Ensure trace_buffer readers observe the newly allocated + * ring_buffer_per_cpu before they check the cpumask. Instead of using a + * read barrier for all readers, send an IPI. 
+ */ + if (unlikely(system_state == SYSTEM_RUNNING)) { + on_each_cpu(rb_cpu_sync, NULL, 1); + /* Not really needed, but documents what is happening */ + smp_wmb(); + } + cpumask_set_cpu(cpu, buffer->cpumask); return 0; } diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 5b4be87ba59d..3884b14df375 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -23,6 +23,19 @@ config LTL_MON_EVENTS_ID config RV_LTL_MONITOR bool +config RV_HA_MONITOR + bool + +config HA_MON_EVENTS_IMPLICIT + select DA_MON_EVENTS_IMPLICIT + select RV_HA_MONITOR + bool + +config HA_MON_EVENTS_ID + select DA_MON_EVENTS_ID + select RV_HA_MONITOR + bool + menuconfig RV bool "Runtime Verification" select TRACING @@ -65,6 +78,11 @@ source "kernel/trace/rv/monitors/pagefault/Kconfig" source "kernel/trace/rv/monitors/sleep/Kconfig" # Add new rtapp monitors here +source "kernel/trace/rv/monitors/stall/Kconfig" +source "kernel/trace/rv/monitors/deadline/Kconfig" +source "kernel/trace/rv/monitors/nomiss/Kconfig" +# Add new deadline monitors here + # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 750e4ad6fa0f..94498da35b37 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -17,6 +17,9 @@ obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o +obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o +obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o +obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/deadline/Kconfig b/kernel/trace/rv/monitors/deadline/Kconfig new file mode 100644 index 000000000000..38804a6ad91d --- /dev/null +++ b/kernel/trace/rv/monitors/deadline/Kconfig @@ -0,0 +1,10 
@@ +config RV_MON_DEADLINE + depends on RV + bool "deadline monitor" + help + Collection of monitors to check the deadline scheduler and server + behave according to specifications. Enable this to enable all + scheduler specification supported by the current kernel. + + For further information, see: + Documentation/trace/rv/monitor_deadline.rst diff --git a/kernel/trace/rv/monitors/deadline/deadline.c b/kernel/trace/rv/monitors/deadline/deadline.c new file mode 100644 index 000000000000..d566d4542ebf --- /dev/null +++ b/kernel/trace/rv/monitors/deadline/deadline.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <linux/kallsyms.h> + +#define MODULE_NAME "deadline" + +#include "deadline.h" + +struct rv_monitor rv_deadline = { + .name = "deadline", + .description = "container for several deadline scheduler specifications.", + .enable = NULL, + .disable = NULL, + .reset = NULL, + .enabled = 0, +}; + +/* Used by other monitors */ +struct sched_class *rv_ext_sched_class; + +static int __init register_deadline(void) +{ + if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT)) { + rv_ext_sched_class = (void *)kallsyms_lookup_name("ext_sched_class"); + if (!rv_ext_sched_class) + pr_warn("rv: Missing ext_sched_class, monitors may not work.\n"); + } + return rv_register_monitor(&rv_deadline, NULL); +} + +static void __exit unregister_deadline(void) +{ + rv_unregister_monitor(&rv_deadline); +} + +module_init(register_deadline); +module_exit(unregister_deadline); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("deadline: container for several deadline scheduler specifications."); diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h new file mode 100644 index 000000000000..0bbfd2543329 --- /dev/null +++ b/kernel/trace/rv/monitors/deadline/deadline.h @@ -0,0 +1,202 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/kernel.h> +#include <linux/uaccess.h> +#include <linux/sched/deadline.h> +#include <asm/syscall.h> +#include <uapi/linux/sched/types.h> +#include <trace/events/sched.h> + +/* + * Dummy values if not available + */ +#ifndef __NR_sched_setscheduler +#define __NR_sched_setscheduler -__COUNTER__ +#endif +#ifndef __NR_sched_setattr +#define __NR_sched_setattr -__COUNTER__ +#endif + +extern struct rv_monitor rv_deadline; +/* Initialised when registering the deadline container */ +extern struct sched_class *rv_ext_sched_class; + +/* + * If both have dummy values, the syscalls are not supported and we don't even + * need to register the handler. + */ +static inline bool should_skip_syscall_handle(void) +{ + return __NR_sched_setattr < 0 && __NR_sched_setscheduler < 0; +} + +/* + * is_supported_type - return true if @type is supported by the deadline monitors + */ +static inline bool is_supported_type(u8 type) +{ + return type == DL_TASK || type == DL_SERVER_FAIR || type == DL_SERVER_EXT; +} + +/* + * is_server_type - return true if @type is a supported server + */ +static inline bool is_server_type(u8 type) +{ + return is_supported_type(type) && type != DL_TASK; +} + +/* + * Use negative numbers for the server. + * Currently only one fair server per CPU, may change in the future. + */ +#define fair_server_id(cpu) (-cpu) +#define ext_server_id(cpu) (-cpu - num_possible_cpus()) +#define NO_SERVER_ID (-2 * num_possible_cpus()) +/* + * Get a unique id used for dl entities + * + * The cpu is not required for tasks as the pid is used there, if this function + * is called on a dl_se that for sure corresponds to a task, DL_TASK can be + * used in place of cpu. + * We need the cpu for servers as it is provided in the tracepoint and we + * cannot easily retrieve it from the dl_se (requires the struct rq definition). 
+ */ +static inline int get_entity_id(struct sched_dl_entity *dl_se, int cpu, u8 type) +{ + if (dl_server(dl_se) && type != DL_TASK) { + if (type == DL_SERVER_FAIR) + return fair_server_id(cpu); + if (type == DL_SERVER_EXT) + return ext_server_id(cpu); + return NO_SERVER_ID; + } + return dl_task_of(dl_se)->pid; +} + +static inline bool task_is_scx_enabled(struct task_struct *tsk) +{ + return IS_ENABLED(CONFIG_SCHED_CLASS_EXT) && + tsk->sched_class == rv_ext_sched_class; +} + +/* Expand id and target as arguments for da functions */ +#define EXPAND_ID(dl_se, cpu, type) get_entity_id(dl_se, cpu, type), dl_se +#define EXPAND_ID_TASK(tsk) get_entity_id(&tsk->dl, task_cpu(tsk), DL_TASK), &tsk->dl + +static inline u8 get_server_type(struct task_struct *tsk) +{ + if (tsk->policy == SCHED_NORMAL || tsk->policy == SCHED_EXT || + tsk->policy == SCHED_BATCH || tsk->policy == SCHED_IDLE) + return task_is_scx_enabled(tsk) ? DL_SERVER_EXT : DL_SERVER_FAIR; + return DL_OTHER; +} + +static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out) +{ + size_t size = offsetofend(struct sched_attr, sched_flags); + struct sched_attr __user *uattr, attr; + int new_policy = -1, ret; + unsigned long args[6]; + + switch (id) { + case __NR_sched_setscheduler: + syscall_get_arguments(current, regs, args); + *pid_out = args[0]; + new_policy = args[1]; + break; + case __NR_sched_setattr: + syscall_get_arguments(current, regs, args); + *pid_out = args[0]; + uattr = (struct sched_attr __user *)args[1]; + /* + * Just copy up to sched_flags, we are not interested after that + */ + ret = copy_struct_from_user(&attr, size, uattr, size); + if (ret) + return ret; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + return -EINVAL; + new_policy = attr.sched_policy; + break; + default: + return -EINVAL; + } + + return new_policy & ~SCHED_RESET_ON_FORK; +} + +/* Helper functions requiring DA/HA utilities */ +#ifdef RV_MON_TYPE + +/* + * get_fair_server - get the fair server associated to a 
task + * + * If the task is a boosted task, the server is available in the task_struct, + * otherwise grab the dl entity saved for the CPU where the task is enqueued. + * This function assumes the task is enqueued somewhere. + */ +static inline struct sched_dl_entity *get_server(struct task_struct *tsk, u8 type) +{ + if (tsk->dl_server && get_server_type(tsk) == type) + return tsk->dl_server; + if (type == DL_SERVER_FAIR) + return da_get_target_by_id(fair_server_id(task_cpu(tsk))); + if (type == DL_SERVER_EXT) + return da_get_target_by_id(ext_server_id(task_cpu(tsk))); + return NULL; +} + +/* + * Initialise monitors for all tasks and pre-allocate the storage for servers. + * This is necessary since we don't have access to the servers here and + * allocation can cause deadlocks from their tracepoints. We can only fill + * pre-initialised storage from there. + */ +static inline int init_storage(bool skip_tasks) +{ + struct task_struct *g, *p; + int cpu; + + for_each_possible_cpu(cpu) { + if (!da_create_empty_storage(fair_server_id(cpu))) + goto fail; + if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT) && + !da_create_empty_storage(ext_server_id(cpu))) + goto fail; + } + + if (skip_tasks) + return 0; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p->policy == SCHED_DEADLINE) { + if (!da_create_storage(EXPAND_ID_TASK(p), NULL)) { + read_unlock(&tasklist_lock); + goto fail; + } + } + } + read_unlock(&tasklist_lock); + return 0; + +fail: + da_monitor_destroy(); + return -ENOMEM; +} + +static void __maybe_unused handle_newtask(void *data, struct task_struct *task, u64 flags) +{ + /* Might be superfluous as tasks are not started with this policy.. 
*/ +	if (task->policy == SCHED_DEADLINE) +		da_create_storage(EXPAND_ID_TASK(task), NULL); +} + +static void __maybe_unused handle_exit(void *data, struct task_struct *p, bool group_dead) +{ +	if (p->policy == SCHED_DEADLINE) +		da_destroy_storage(get_entity_id(&p->dl, DL_TASK, DL_TASK)); +} + +#endif diff --git a/kernel/trace/rv/monitors/nomiss/Kconfig b/kernel/trace/rv/monitors/nomiss/Kconfig new file mode 100644 index 000000000000..e1886c3a0dd9 --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_NOMISS + depends on RV + depends on HAVE_SYSCALL_TRACEPOINTS + depends on RV_MON_DEADLINE + default y + select HA_MON_EVENTS_ID + bool "nomiss monitor" + help + Monitor to ensure dl entities run to completion before their deadline. + This monitor is part of the deadline monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_deadline.rst diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c new file mode 100644 index 000000000000..31f90f3638d8 --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/nomiss.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "nomiss" + +#include <uapi/linux/sched/types.h> +#include <trace/events/syscalls.h> +#include <trace/events/sched.h> +#include <trace/events/task.h> +#include <rv_trace.h> + +#define RV_MON_TYPE RV_MON_PER_OBJ +#define HA_TIMER_TYPE HA_TIMER_WHEEL +/* The start condition is on sched_switch, it's dangerous to allocate there */ +#define DA_SKIP_AUTO_ALLOC +typedef struct sched_dl_entity *monitor_target; +#include "nomiss.h" +#include <rv/ha_monitor.h> +#include <monitors/deadline/deadline.h> + +/* + * User configurable deadline threshold. 
If the total utilisation of deadline + * tasks is larger than 1, they are only guaranteed bounded tardiness. See + * Documentation/scheduler/sched-deadline.rst for more details. + * The minimum tardiness without sched_feat(HRTICK_DL) is 1 tick to accommodate + * for throttle enforced on the next tick. + */ +static u64 deadline_thresh = TICK_NSEC; +module_param(deadline_thresh, ullong, 0644); +#define DEADLINE_NS(ha_mon) (ha_get_target(ha_mon)->dl_deadline + deadline_thresh) + +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns) +{ + if (env == clk_nomiss) + return ha_get_clk_ns(ha_mon, env, time_ns); + else if (env == is_constr_dl_nomiss) + return !dl_is_implicit(ha_get_target(ha_mon)); + else if (env == is_defer_nomiss) + return ha_get_target(ha_mon)->dl_defer; + return ENV_INVALID_VALUE; +} + +static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_nomiss env, u64 time_ns) +{ + if (env == clk_nomiss) + ha_reset_clk_ns(ha_mon, env, time_ns); +} + +static inline bool ha_verify_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state == ready_nomiss) + return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns); + else if (curr_state == running_nomiss) + return ha_check_invariant_ns(ha_mon, clk_nomiss, time_ns); + return true; +} + +static inline void ha_convert_inv_guard(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state == next_state) + return; + if (curr_state == ready_nomiss) + ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); + else if (curr_state == running_nomiss) + ha_inv_to_guard(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); +} + +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + bool res = true; + + if (curr_state == ready_nomiss && 
event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == ready_nomiss && event == dl_throttle_nomiss) + res = ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull; + else if (curr_state == idle_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == running_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == sleeping_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + else if (curr_state == sleeping_nomiss && event == dl_throttle_nomiss) + res = ha_get_env(ha_mon, is_constr_dl_nomiss, time_ns) == 1ull || + ha_get_env(ha_mon, is_defer_nomiss, time_ns) == 1ull; + else if (curr_state == throttled_nomiss && event == dl_replenish_nomiss) + ha_reset_env(ha_mon, clk_nomiss, time_ns); + return res; +} + +static inline void ha_setup_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (next_state == curr_state && event != dl_replenish_nomiss) + return; + if (next_state == ready_nomiss) + ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); + else if (next_state == running_nomiss) + ha_start_timer_ns(ha_mon, clk_nomiss, DEADLINE_NS(ha_mon), time_ns); + else if (curr_state == ready_nomiss) + ha_cancel_timer(ha_mon); + else if (curr_state == running_nomiss) + ha_cancel_timer(ha_mon); +} + +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_convert_inv_guard(ha_mon, curr_state, event, next_state, time_ns); + + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns); + + return true; +} + +static void 
handle_dl_replenish(void *data, struct sched_dl_entity *dl_se, + int cpu, u8 type) +{ + if (is_supported_type(type)) + da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_replenish_nomiss); +} + +static void handle_dl_throttle(void *data, struct sched_dl_entity *dl_se, + int cpu, u8 type) +{ + if (is_supported_type(type)) + da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_throttle_nomiss); +} + +static void handle_dl_server_stop(void *data, struct sched_dl_entity *dl_se, + int cpu, u8 type) +{ + /* + * This isn't the standard use of da_handle_start_run_event since this + * event cannot only occur from the initial state. + * It is fine to use here because it always brings to a known state and + * the fact we "pretend" the transition starts from the initial state + * has no side effect. + */ + if (is_supported_type(type)) + da_handle_start_run_event(EXPAND_ID(dl_se, cpu, type), dl_server_stop_nomiss); +} + +static inline void handle_server_switch(struct task_struct *next, int cpu, u8 type) +{ + struct sched_dl_entity *dl_se = get_server(next, type); + + if (dl_se && is_idle_task(next)) + da_handle_event(EXPAND_ID(dl_se, cpu, type), dl_server_idle_nomiss); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + int cpu = task_cpu(next); + + if (prev_state != TASK_RUNNING && !preempt && prev->policy == SCHED_DEADLINE) + da_handle_event(EXPAND_ID_TASK(prev), sched_switch_suspend_nomiss); + if (next->policy == SCHED_DEADLINE) + da_handle_start_run_event(EXPAND_ID_TASK(next), sched_switch_in_nomiss); + + /* + * The server is available in next only if the next task is boosted, + * otherwise we need to retrieve it. + * Here the server continues in the state running/armed until actually + * stopped, this works since we continue expecting a throttle. 
+ */ + if (next->dl_server) + da_handle_start_event(EXPAND_ID(next->dl_server, cpu, + get_server_type(next)), + sched_switch_in_nomiss); + else { + handle_server_switch(next, cpu, DL_SERVER_FAIR); + if (IS_ENABLED(CONFIG_SCHED_CLASS_EXT)) + handle_server_switch(next, cpu, DL_SERVER_EXT); + } +} + +static void handle_sys_enter(void *data, struct pt_regs *regs, long id) +{ + struct task_struct *p; + int new_policy = -1; + pid_t pid = 0; + + new_policy = extract_params(regs, id, &pid); + if (new_policy < 0) + return; + guard(rcu)(); + p = pid ? find_task_by_vpid(pid) : current; + if (unlikely(!p) || new_policy == p->policy) + return; + + if (p->policy == SCHED_DEADLINE) + da_reset(EXPAND_ID_TASK(p)); + else if (new_policy == SCHED_DEADLINE) + da_create_or_get(EXPAND_ID_TASK(p)); +} + +static void handle_sched_wakeup(void *data, struct task_struct *tsk) +{ + if (tsk->policy == SCHED_DEADLINE) + da_handle_event(EXPAND_ID_TASK(tsk), sched_wakeup_nomiss); +} + +static int enable_nomiss(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + retval = init_storage(false); + if (retval) + return retval; + rv_attach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish); + rv_attach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle); + rv_attach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop); + rv_attach_trace_probe("nomiss", sched_switch, handle_sched_switch); + rv_attach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup); + if (!should_skip_syscall_handle()) + rv_attach_trace_probe("nomiss", sys_enter, handle_sys_enter); + rv_attach_trace_probe("nomiss", task_newtask, handle_newtask); + rv_attach_trace_probe("nomiss", sched_process_exit, handle_exit); + + return 0; +} + +static void disable_nomiss(void) +{ + rv_this.enabled = 0; + + /* Those are RCU writers, detach earlier hoping to close a bit faster */ + rv_detach_trace_probe("nomiss", task_newtask, handle_newtask); + 
rv_detach_trace_probe("nomiss", sched_process_exit, handle_exit); + if (!should_skip_syscall_handle()) + rv_detach_trace_probe("nomiss", sys_enter, handle_sys_enter); + + rv_detach_trace_probe("nomiss", sched_dl_replenish_tp, handle_dl_replenish); + rv_detach_trace_probe("nomiss", sched_dl_throttle_tp, handle_dl_throttle); + rv_detach_trace_probe("nomiss", sched_dl_server_stop_tp, handle_dl_server_stop); + rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch); + rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this = { + .name = "nomiss", + .description = "dl entities run to completion before their deadline.", + .enable = enable_nomiss, + .disable = disable_nomiss, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_nomiss(void) +{ + return rv_register_monitor(&rv_this, &rv_deadline); +} + +static void __exit unregister_nomiss(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_nomiss); +module_exit(unregister_nomiss); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("nomiss: dl entities run to completion before their deadline."); diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.h b/kernel/trace/rv/monitors/nomiss/nomiss.h new file mode 100644 index 000000000000..3d1b436194d7 --- /dev/null +++ b/kernel/trace/rv/monitors/nomiss/nomiss.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of nomiss automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME nomiss + +enum states_nomiss { + ready_nomiss, + idle_nomiss, + running_nomiss, + sleeping_nomiss, + throttled_nomiss, + state_max_nomiss, +}; + +#define INVALID_STATE state_max_nomiss + +enum events_nomiss { + dl_replenish_nomiss, + dl_server_idle_nomiss, + 
dl_server_stop_nomiss, + dl_throttle_nomiss, + sched_switch_in_nomiss, + sched_switch_suspend_nomiss, + sched_wakeup_nomiss, + event_max_nomiss, +}; + +enum envs_nomiss { + clk_nomiss, + is_constr_dl_nomiss, + is_defer_nomiss, + env_max_nomiss, + env_max_stored_nomiss = is_constr_dl_nomiss, +}; + +_Static_assert(env_max_stored_nomiss <= MAX_HA_ENV_LEN, "Not enough slots"); +#define HA_CLK_NS + +struct automaton_nomiss { + char *state_names[state_max_nomiss]; + char *event_names[event_max_nomiss]; + char *env_names[env_max_nomiss]; + unsigned char function[state_max_nomiss][event_max_nomiss]; + unsigned char initial_state; + bool final_states[state_max_nomiss]; +}; + +static const struct automaton_nomiss automaton_nomiss = { + .state_names = { + "ready", + "idle", + "running", + "sleeping", + "throttled", + }, + .event_names = { + "dl_replenish", + "dl_server_idle", + "dl_server_stop", + "dl_throttle", + "sched_switch_in", + "sched_switch_suspend", + "sched_wakeup", + }, + .env_names = { + "clk", + "is_constr_dl", + "is_defer", + }, + .function = { + { + ready_nomiss, + idle_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + INVALID_STATE, + ready_nomiss, + }, + { + ready_nomiss, + idle_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + INVALID_STATE, + INVALID_STATE, + }, + { + running_nomiss, + idle_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + sleeping_nomiss, + running_nomiss, + }, + { + ready_nomiss, + sleeping_nomiss, + sleeping_nomiss, + throttled_nomiss, + running_nomiss, + INVALID_STATE, + ready_nomiss, + }, + { + ready_nomiss, + throttled_nomiss, + INVALID_STATE, + throttled_nomiss, + INVALID_STATE, + throttled_nomiss, + throttled_nomiss, + }, + }, + .initial_state = ready_nomiss, + .final_states = { 1, 0, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/nomiss/nomiss_trace.h b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h new file mode 100644 index 000000000000..42e7efaca4e7 --- /dev/null +++ 
b/kernel/trace/rv/monitors/nomiss/nomiss_trace.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_NOMISS +DEFINE_EVENT(event_da_monitor_id, event_nomiss, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_nomiss, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); + +DEFINE_EVENT(error_env_da_monitor_id, error_env_nomiss, + TP_PROTO(int id, char *state, char *event, char *env), + TP_ARGS(id, state, event, env)); +#endif /* CONFIG_RV_MON_NOMISS */ diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig index 561d32da572b..6d02e239b684 100644 --- a/kernel/trace/rv/monitors/opid/Kconfig +++ b/kernel/trace/rv/monitors/opid/Kconfig @@ -2,18 +2,13 @@ # config RV_MON_OPID depends on RV - depends on TRACE_IRQFLAGS - depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED - default y if PREEMPT_RT - select DA_MON_EVENTS_IMPLICIT + default y + select HA_MON_EVENTS_IMPLICIT bool "opid monitor" help Monitor to ensure operations like wakeup and need resched occur with - interrupts and preemption disabled or during IRQs, where preemption - may not be disabled explicitly. - - This monitor is unstable on !PREEMPT_RT, say N unless you are testing it. + interrupts and preemption disabled. 
For further information, see: Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c index 25a40e90fa40..4594c7c46601 100644 --- a/kernel/trace/rv/monitors/opid/opid.c +++ b/kernel/trace/rv/monitors/opid/opid.c @@ -10,94 +10,63 @@ #define MODULE_NAME "opid" #include <trace/events/sched.h> -#include <trace/events/irq.h> -#include <trace/events/preemptirq.h> #include <rv_trace.h> #include <monitors/sched/sched.h> #define RV_MON_TYPE RV_MON_PER_CPU #include "opid.h" -#include <rv/da_monitor.h> +#include <rv/ha_monitor.h> -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/trace/irq_vectors.h> - -static void handle_vector_irq_entry(void *data, int vector) +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns) { - da_handle_event(irq_entry_opid); -} - -static void attach_vector_irq(void) -{ - rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); - if (IS_ENABLED(CONFIG_IRQ_WORK)) - rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); - if (IS_ENABLED(CONFIG_SMP)) { - rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); - rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); - rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); + if (env == irq_off_opid) + return irqs_disabled(); + else if (env == preempt_off_opid) { + /* + * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables + * preemption (adding one to the preempt_count). Since we are + * interested in the preempt_count at the time the tracepoint was + * hit, we consider 1 as still enabled. 
+ */ + if (IS_ENABLED(CONFIG_PREEMPTION)) + return (preempt_count() & PREEMPT_MASK) > 1; + return true; } + return ENV_INVALID_VALUE; } -static void detach_vector_irq(void) +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) { - rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry); - if (IS_ENABLED(CONFIG_IRQ_WORK)) - rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry); - if (IS_ENABLED(CONFIG_SMP)) { - rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry); - rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry); - rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry); - } + bool res = true; + + if (curr_state == any_opid && event == sched_need_resched_opid) + res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull; + else if (curr_state == any_opid && event == sched_waking_opid) + res = ha_get_env(ha_mon, irq_off_opid, time_ns) == 1ull && + ha_get_env(ha_mon, preempt_off_opid, time_ns) == 1ull; + return res; } -#else -/* We assume irq_entry tracepoints are sufficient on other architectures */ -static void attach_vector_irq(void) { } -static void detach_vector_irq(void) { } -#endif - -static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) { - da_handle_event(irq_disable_opid); -} + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; -static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_event(irq_enable_opid); -} - -static void handle_irq_entry(void *data, int irq, struct irqaction *action) -{ - da_handle_event(irq_entry_opid); -} - -static void handle_preempt_disable(void *data, unsigned long ip, unsigned long 
parent_ip) -{ - da_handle_event(preempt_disable_opid); -} - -static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_event(preempt_enable_opid); + return true; } static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif) { - /* The monitor's intitial state is not in_irq */ - if (this_cpu_read(hardirq_context)) - da_handle_event(sched_need_resched_opid); - else - da_handle_start_event(sched_need_resched_opid); + da_handle_start_run_event(sched_need_resched_opid); } static void handle_sched_waking(void *data, struct task_struct *p) { - /* The monitor's intitial state is not in_irq */ - if (this_cpu_read(hardirq_context)) - da_handle_event(sched_waking_opid); - else - da_handle_start_event(sched_waking_opid); + da_handle_start_run_event(sched_waking_opid); } static int enable_opid(void) @@ -108,14 +77,8 @@ static int enable_opid(void) if (retval) return retval; - rv_attach_trace_probe("opid", irq_disable, handle_irq_disable); - rv_attach_trace_probe("opid", irq_enable, handle_irq_enable); - rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry); - rv_attach_trace_probe("opid", preempt_disable, handle_preempt_disable); - rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable); rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); rv_attach_trace_probe("opid", sched_waking, handle_sched_waking); - attach_vector_irq(); return 0; } @@ -124,14 +87,8 @@ static void disable_opid(void) { rv_this.enabled = 0; - rv_detach_trace_probe("opid", irq_disable, handle_irq_disable); - rv_detach_trace_probe("opid", irq_enable, handle_irq_enable); - rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry); - rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable); - rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable); rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); 
rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); - detach_vector_irq(); da_monitor_destroy(); } diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h index 092992514970..fb0aa4c28aa6 100644 --- a/kernel/trace/rv/monitors/opid/opid.h +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -8,30 +8,31 @@ #define MONITOR_NAME opid enum states_opid { - disabled_opid, - enabled_opid, - in_irq_opid, - irq_disabled_opid, - preempt_disabled_opid, + any_opid, state_max_opid, }; #define INVALID_STATE state_max_opid enum events_opid { - irq_disable_opid, - irq_enable_opid, - irq_entry_opid, - preempt_disable_opid, - preempt_enable_opid, sched_need_resched_opid, sched_waking_opid, event_max_opid, }; +enum envs_opid { + irq_off_opid, + preempt_off_opid, + env_max_opid, + env_max_stored_opid = irq_off_opid, +}; + +_Static_assert(env_max_stored_opid <= MAX_HA_ENV_LEN, "Not enough slots"); + struct automaton_opid { char *state_names[state_max_opid]; char *event_names[event_max_opid]; + char *env_names[env_max_opid]; unsigned char function[state_max_opid][event_max_opid]; unsigned char initial_state; bool final_states[state_max_opid]; @@ -39,68 +40,19 @@ struct automaton_opid { static const struct automaton_opid automaton_opid = { .state_names = { - "disabled", - "enabled", - "in_irq", - "irq_disabled", - "preempt_disabled", + "any", }, .event_names = { - "irq_disable", - "irq_enable", - "irq_entry", - "preempt_disable", - "preempt_enable", "sched_need_resched", "sched_waking", }, + .env_names = { + "irq_off", + "preempt_off", + }, .function = { - { - INVALID_STATE, - preempt_disabled_opid, - disabled_opid, - INVALID_STATE, - irq_disabled_opid, - disabled_opid, - disabled_opid, - }, - { - irq_disabled_opid, - INVALID_STATE, - INVALID_STATE, - preempt_disabled_opid, - enabled_opid, - INVALID_STATE, - INVALID_STATE, - }, - { - INVALID_STATE, - enabled_opid, - in_irq_opid, - INVALID_STATE, - INVALID_STATE, - in_irq_opid, - in_irq_opid, - }, - 
{ - INVALID_STATE, - enabled_opid, - in_irq_opid, - disabled_opid, - INVALID_STATE, - irq_disabled_opid, - INVALID_STATE, - }, - { - disabled_opid, - INVALID_STATE, - INVALID_STATE, - INVALID_STATE, - enabled_opid, - INVALID_STATE, - INVALID_STATE, - }, + { any_opid, any_opid }, }, - .initial_state = disabled_opid, - .final_states = { 0, 1, 0, 0, 0 }, + .initial_state = any_opid, + .final_states = { 1 }, }; diff --git a/kernel/trace/rv/monitors/opid/opid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h index 3df6ff955c30..b04005b64208 100644 --- a/kernel/trace/rv/monitors/opid/opid_trace.h +++ b/kernel/trace/rv/monitors/opid/opid_trace.h @@ -12,4 +12,8 @@ DEFINE_EVENT(event_da_monitor, event_opid, DEFINE_EVENT(error_da_monitor, error_opid, TP_PROTO(char *state, char *event), TP_ARGS(state, event)); + +DEFINE_EVENT(error_env_da_monitor, error_env_opid, + TP_PROTO(char *state, char *event, char *env), + TP_ARGS(state, event, env)); #endif /* CONFIG_RV_MON_OPID */ diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c index c1347da69e9d..8dfe5ec13e19 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.c +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -49,6 +49,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_EPOLL_WAIT, false); ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false); } @@ -63,6 +64,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + ltl_atom_set(mon, LTL_EPOLL_WAIT, false); if (strstarts(task->comm, "migration/")) ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true); @@ -162,6 +164,11 @@ static 
void handle_sys_enter(void *data, struct pt_regs *regs, long id) break; } break; +#ifdef __NR_epoll_wait + case __NR_epoll_wait: + ltl_atom_update(current, LTL_EPOLL_WAIT, true); + break; +#endif } } @@ -174,6 +181,7 @@ static void handle_sys_exit(void *data, struct pt_regs *regs, long ret) ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_EPOLL_WAIT, false); ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false); } diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h index 2ab46fd218d2..95dc2727c059 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.h +++ b/kernel/trace/rv/monitors/sleep/sleep.h @@ -15,6 +15,7 @@ enum ltl_atom { LTL_ABORT_SLEEP, LTL_BLOCK_ON_RT_MUTEX, LTL_CLOCK_NANOSLEEP, + LTL_EPOLL_WAIT, LTL_FUTEX_LOCK_PI, LTL_FUTEX_WAIT, LTL_KERNEL_THREAD, @@ -40,6 +41,7 @@ static const char *ltl_atom_str(enum ltl_atom atom) "ab_sl", "bl_on_rt_mu", "cl_na", + "ep_wa", "fu_lo_pi", "fu_wa", "ker_th", @@ -75,39 +77,41 @@ static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) { - bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); - bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); - bool val40 = task_is_rcu || task_is_migration; - bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); - bool val41 = futex_lock_pi || val40; - bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); - bool val5 = block_on_rt_mutex || val41; - bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); - bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); - bool val32 = abort_sleep || kthread_should_stop; bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); - bool val33 = woken_by_nmi || val32; bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); - bool 
val34 = woken_by_hardirq || val33; bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, mon->atoms); - bool val14 = woken_by_equal_or_higher_prio || val34; bool wake = test_bit(LTL_WAKE, mon->atoms); - bool val13 = !wake; - bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool rt = test_bit(LTL_RT, mon->atoms); + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); - bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; - bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); - bool val25 = nanosleep_timer_abstime && val24; - bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); - bool val18 = clock_nanosleep && val25; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); - bool val9 = futex_wait || val18; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms); + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val42 = task_is_rcu || task_is_migration; + bool val43 = futex_lock_pi || val42; + bool val5 = block_on_rt_mutex || val43; + bool val34 = abort_sleep || kthread_should_stop; + bool val35 = woken_by_nmi || val34; + bool val36 = woken_by_hardirq || val35; + bool val14 = woken_by_equal_or_higher_prio || val36; + bool val13 = !wake; + bool val26 = 
nanosleep_clock_monotonic || nanosleep_clock_tai; + bool val27 = nanosleep_timer_abstime && val26; + bool val18 = clock_nanosleep && val27; + bool val20 = val18 || epoll_wait; + bool val9 = futex_wait || val20; bool val11 = val9 || kernel_thread; - bool sleep = test_bit(LTL_SLEEP, mon->atoms); bool val2 = !sleep; - bool rt = test_bit(LTL_RT, mon->atoms); bool val1 = !rt; bool val3 = val1 || val2; @@ -124,39 +128,41 @@ static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) static void ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) { - bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); - bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); - bool val40 = task_is_rcu || task_is_migration; - bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); - bool val41 = futex_lock_pi || val40; - bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); - bool val5 = block_on_rt_mutex || val41; - bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); - bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); - bool val32 = abort_sleep || kthread_should_stop; bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); - bool val33 = woken_by_nmi || val32; bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); - bool val34 = woken_by_hardirq || val33; bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, mon->atoms); - bool val14 = woken_by_equal_or_higher_prio || val34; bool wake = test_bit(LTL_WAKE, mon->atoms); - bool val13 = !wake; - bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool rt = test_bit(LTL_RT, mon->atoms); + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); bool 
nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); - bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; - bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); - bool val25 = nanosleep_timer_abstime && val24; - bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); - bool val18 = clock_nanosleep && val25; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); - bool val9 = futex_wait || val18; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms); + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val42 = task_is_rcu || task_is_migration; + bool val43 = futex_lock_pi || val42; + bool val5 = block_on_rt_mutex || val43; + bool val34 = abort_sleep || kthread_should_stop; + bool val35 = woken_by_nmi || val34; + bool val36 = woken_by_hardirq || val35; + bool val14 = woken_by_equal_or_higher_prio || val36; + bool val13 = !wake; + bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool val27 = nanosleep_timer_abstime && val26; + bool val18 = clock_nanosleep && val27; + bool val20 = val18 || epoll_wait; + bool val9 = futex_wait || val20; bool val11 = val9 || kernel_thread; - bool sleep = test_bit(LTL_SLEEP, mon->atoms); bool val2 = !sleep; - bool rt = test_bit(LTL_RT, mon->atoms); bool val1 = !rt; bool val3 = val1 || val2; diff --git a/kernel/trace/rv/monitors/stall/Kconfig b/kernel/trace/rv/monitors/stall/Kconfig new file mode 100644 index 000000000000..6f846b642544 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/Kconfig @@ -0,0 +1,13 @@ +# 
SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_STALL + depends on RV + select HA_MON_EVENTS_ID + bool "stall monitor" + help + Enable the stall sample monitor that illustrates the usage of hybrid + automata monitors. It can be used to identify tasks stalled for + longer than a threshold. + + For further information, see: + Documentation/trace/rv/monitor_stall.rst diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c new file mode 100644 index 000000000000..9ccfda6b0e73 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/stall.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> + +#define MODULE_NAME "stall" + +#include <trace/events/sched.h> +#include <rv_trace.h> + +#define RV_MON_TYPE RV_MON_PER_TASK +#define HA_TIMER_TYPE HA_TIMER_WHEEL +#include "stall.h" +#include <rv/ha_monitor.h> + +static u64 threshold_jiffies = 1000; +module_param(threshold_jiffies, ullong, 0644); + +static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns) +{ + if (env == clk_stall) + return ha_get_clk_jiffy(ha_mon, env); + return ENV_INVALID_VALUE; +} + +static void ha_reset_env(struct ha_monitor *ha_mon, enum envs_stall env, u64 time_ns) +{ + if (env == clk_stall) + ha_reset_clk_jiffy(ha_mon, env); +} + +static inline bool ha_verify_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (curr_state == enqueued_stall) + return ha_check_invariant_jiffy(ha_mon, clk_stall, time_ns); + return true; +} + +static inline bool ha_verify_guards(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + bool res = true; + + if (curr_state == dequeued_stall && event == sched_wakeup_stall) + 
ha_reset_env(ha_mon, clk_stall, time_ns); + else if (curr_state == running_stall && event == sched_switch_preempt_stall) + ha_reset_env(ha_mon, clk_stall, time_ns); + return res; +} + +static inline void ha_setup_invariants(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (next_state == curr_state) + return; + if (next_state == enqueued_stall) + ha_start_timer_jiffy(ha_mon, clk_stall, threshold_jiffies, time_ns); + else if (curr_state == enqueued_stall) + ha_cancel_timer(ha_mon); +} + +static bool ha_verify_constraint(struct ha_monitor *ha_mon, + enum states curr_state, enum events event, + enum states next_state, u64 time_ns) +{ + if (!ha_verify_invariants(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + if (!ha_verify_guards(ha_mon, curr_state, event, next_state, time_ns)) + return false; + + ha_setup_invariants(ha_mon, curr_state, event, next_state, time_ns); + + return true; +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + if (!preempt && prev_state != TASK_RUNNING) + da_handle_start_event(prev, sched_switch_wait_stall); + else + da_handle_event(prev, sched_switch_preempt_stall); + da_handle_event(next, sched_switch_in_stall); +} + +static void handle_sched_wakeup(void *data, struct task_struct *p) +{ + da_handle_event(p, sched_wakeup_stall); +} + +static int enable_stall(void) +{ + int retval; + + retval = da_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("stall", sched_switch, handle_sched_switch); + rv_attach_trace_probe("stall", sched_wakeup, handle_sched_wakeup); + + return 0; +} + +static void disable_stall(void) +{ + rv_this.enabled = 0; + + rv_detach_trace_probe("stall", sched_switch, handle_sched_switch); + rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup); + + da_monitor_destroy(); +} + +static struct rv_monitor rv_this 
= { + .name = "stall", + .description = "identify tasks stalled for longer than a threshold.", + .enable = enable_stall, + .disable = disable_stall, + .reset = da_monitor_reset_all, + .enabled = 0, +}; + +static int __init register_stall(void) +{ + return rv_register_monitor(&rv_this, NULL); +} + +static void __exit unregister_stall(void) +{ + rv_unregister_monitor(&rv_this); +} + +module_init(register_stall); +module_exit(unregister_stall); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("stall: identify tasks stalled for longer than a threshold."); diff --git a/kernel/trace/rv/monitors/stall/stall.h b/kernel/trace/rv/monitors/stall/stall.h new file mode 100644 index 000000000000..638520cb1082 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/stall.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of stall automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +#define MONITOR_NAME stall + +enum states_stall { + dequeued_stall, + enqueued_stall, + running_stall, + state_max_stall, +}; + +#define INVALID_STATE state_max_stall + +enum events_stall { + sched_switch_in_stall, + sched_switch_preempt_stall, + sched_switch_wait_stall, + sched_wakeup_stall, + event_max_stall, +}; + +enum envs_stall { + clk_stall, + env_max_stall, + env_max_stored_stall = env_max_stall, +}; + +_Static_assert(env_max_stored_stall <= MAX_HA_ENV_LEN, "Not enough slots"); + +struct automaton_stall { + char *state_names[state_max_stall]; + char *event_names[event_max_stall]; + char *env_names[env_max_stall]; + unsigned char function[state_max_stall][event_max_stall]; + unsigned char initial_state; + bool final_states[state_max_stall]; +}; + +static const struct automaton_stall automaton_stall = { + .state_names = { + "dequeued", + "enqueued", + "running", + }, + .event_names = { + 
"sched_switch_in", + "sched_switch_preempt", + "sched_switch_wait", + "sched_wakeup", + }, + .env_names = { + "clk", + }, + .function = { + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + enqueued_stall, + }, + { + running_stall, + INVALID_STATE, + INVALID_STATE, + enqueued_stall, + }, + { + running_stall, + enqueued_stall, + dequeued_stall, + running_stall, + }, + }, + .initial_state = dequeued_stall, + .final_states = { 1, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/stall/stall_trace.h b/kernel/trace/rv/monitors/stall/stall_trace.h new file mode 100644 index 000000000000..6a7cc1b1d040 --- /dev/null +++ b/kernel/trace/rv/monitors/stall/stall_trace.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_STALL +DEFINE_EVENT(event_da_monitor_id, event_stall, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_stall, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); + +DEFINE_EVENT(error_env_da_monitor_id, error_env_stall, + TP_PROTO(int id, char *state, char *event, char *env), + TP_ARGS(id, state, event, env)); +#endif /* CONFIG_RV_MON_STALL */ diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 4a6faddac614..9622c269789c 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -62,9 +62,39 @@ DECLARE_EVENT_CLASS(error_da_monitor, #include <monitors/scpd/scpd_trace.h> #include <monitors/snep/snep_trace.h> #include <monitors/sts/sts_trace.h> -#include <monitors/opid/opid_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here +#ifdef CONFIG_HA_MON_EVENTS_IMPLICIT +/* For simplicity this class is marked as DA although relevant only for HA */ +DECLARE_EVENT_CLASS(error_env_da_monitor, + + TP_PROTO(char *state, char *event, char *env), + + TP_ARGS(state, event, env), + + 
TP_STRUCT__entry( + __string( state, state ) + __string( event, event ) + __string( env, env ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + __assign_str(env); + ), + + TP_printk("event %s not expected in the state %s with env %s", + __get_str(event), + __get_str(state), + __get_str(env)) +); + +#include <monitors/opid/opid_trace.h> +// Add new monitors based on CONFIG_HA_MON_EVENTS_IMPLICIT here + +#endif + #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ #ifdef CONFIG_DA_MON_EVENTS_ID @@ -128,6 +158,41 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, #include <monitors/sssw/sssw_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here +#ifdef CONFIG_HA_MON_EVENTS_ID +/* For simplicity this class is marked as DA although relevant only for HA */ +DECLARE_EVENT_CLASS(error_env_da_monitor_id, + + TP_PROTO(int id, char *state, char *event, char *env), + + TP_ARGS(id, state, event, env), + + TP_STRUCT__entry( + __field( int, id ) + __string( state, state ) + __string( event, event ) + __string( env, env ) + ), + + TP_fast_assign( + __assign_str(state); + __assign_str(event); + __assign_str(env); + __entry->id = id; + ), + + TP_printk("%d: event %s not expected in the state %s with env %s", + __entry->id, + __get_str(event), + __get_str(state), + __get_str(env)) +); + +#include <monitors/stall/stall_trace.h> +#include <monitors/nomiss/nomiss_trace.h> +// Add new monitors based on CONFIG_HA_MON_EVENTS_ID here + +#endif + #endif /* CONFIG_DA_MON_EVENTS_ID */ #ifdef CONFIG_LTL_MON_EVENTS_ID DECLARE_EVENT_CLASS(event_ltl_monitor_id, diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c new file mode 100644 index 000000000000..02af2297ae5a --- /dev/null +++ b/kernel/trace/simple_ring_buffer.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <linux/atomic.h> +#include <linux/simple_ring_buffer.h> + 
+#include <asm/barrier.h> +#include <asm/local.h> + +enum simple_rb_link_type { + SIMPLE_RB_LINK_NORMAL = 0, + SIMPLE_RB_LINK_HEAD = 1, + SIMPLE_RB_LINK_HEAD_MOVING +}; + +#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING) + +static void simple_bpage_set_head_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + link &= SIMPLE_RB_LINK_MASK; + link |= SIMPLE_RB_LINK_HEAD; + + /* + * Paired with simple_rb_find_head() to order access between the head + * link and overrun. It ensures we always report an up-to-date value + * after swapping the reader page. + */ + smp_store_release(&bpage->link.next, (struct list_head *)link); +} + +static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage, + struct simple_buffer_page *dst, + enum simple_rb_link_type new_type) +{ + unsigned long *link = (unsigned long *)(&bpage->link.next); + unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD; + unsigned long new = (unsigned long)(&dst->link) | new_type; + + return try_cmpxchg(link, &old, new); +} + +static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK)); +} + +static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link) +{ + unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK; + + return container_of((struct list_head *)ptr, struct simple_buffer_page, link); +} + +static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage) +{ + return simple_bpage_from_link(bpage->link.next); +} + +static void simple_bpage_reset(struct simple_buffer_page *bpage) +{ + bpage->write = 0; + bpage->entries = 0; + + local_set(&bpage->page->commit, 0); +} + +static void simple_bpage_init(struct simple_buffer_page *bpage, void *page) +{ + INIT_LIST_HEAD(&bpage->link); + 
bpage->page = (struct buffer_data_page *)page; + + simple_bpage_reset(bpage); +} + +#define simple_rb_meta_inc(__meta, __inc) \ + WRITE_ONCE((__meta), (__meta + __inc)) + +static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer) +{ + return !!cpu_buffer->bpages; +} + +static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer) +{ + int retry = cpu_buffer->nr_pages * 2; + struct simple_buffer_page *head; + + head = cpu_buffer->head_page; + + while (retry--) { + unsigned long link; + +spin: + /* See smp_store_release in simple_bpage_set_head_link() */ + link = (unsigned long)smp_load_acquire(&head->link.prev->next); + + switch (link & ~SIMPLE_RB_LINK_MASK) { + /* Found the head */ + case SIMPLE_RB_LINK_HEAD: + cpu_buffer->head_page = head; + return 0; + /* The writer caught the head, we can spin, that won't be long */ + case SIMPLE_RB_LINK_HEAD_MOVING: + goto spin; + } + + head = simple_bpage_next_page(head); + } + + return -EBUSY; +} + +/** + * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader + * @cpu_buffer: A simple_rb_per_cpu + * + * This function enables consuming reading. It ensures the current head page will not be overwritten + * and can be safely read. + * + * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or -EBUSY if we failed to catch the + * head page. 
+ */ +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *last, *head, *reader; + unsigned long overrun; + int retry = 8; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + reader = cpu_buffer->reader_page; + + do { + /* Run after the writer to find the head */ + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + head = cpu_buffer->head_page; + + /* Connect the reader page around the header page */ + reader->link.next = head->link.next; + reader->link.prev = head->link.prev; + + /* The last page before the head */ + last = simple_bpage_from_link(head->link.prev); + + /* The reader page points to the new header page */ + simple_bpage_set_head_link(reader); + + overrun = cpu_buffer->meta->overrun; + } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--); + + if (!retry) + return -EINVAL; + + cpu_buffer->head_page = simple_bpage_from_link(reader->link.next); + cpu_buffer->head_page->link.prev = &reader->link; + cpu_buffer->reader_page = head; + cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun; + cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id; + cpu_buffer->last_overrun = overrun; + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page); + +static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *tail, *new_tail; + + tail = cpu_buffer->tail_page; + new_tail = simple_bpage_next_page(tail); + + if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) { + /* + * Oh no! we've caught the head. There is none anymore and + * swap_reader will spin until we set the new one. Overrun must + * be written first, to make sure we report the correct number + * of lost events. 
+ */ + simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries); + simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1); + + simple_bpage_set_head_link(new_tail); + simple_bpage_set_normal_link(tail); + } + + simple_bpage_reset(new_tail); + cpu_buffer->tail_page = new_tail; + + simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1); + + return new_tail; +} + +static unsigned long rb_event_size(unsigned long length) +{ + struct ring_buffer_event *event; + + return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); +} + +static struct ring_buffer_event * +rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + + return (struct ring_buffer_event *)((unsigned long)event + 8); +} + +static struct ring_buffer_event * +simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp) +{ + unsigned long ts_ext_size = 0, event_size = rb_event_size(length); + struct simple_buffer_page *tail = cpu_buffer->tail_page; + struct ring_buffer_event *event; + u32 write, prev_write; + u64 time_delta; + + time_delta = timestamp - cpu_buffer->write_stamp; + + if (test_time_stamp(time_delta)) + ts_ext_size = 8; + + prev_write = tail->write; + write = prev_write + event_size + ts_ext_size; + + if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE))) + tail = simple_rb_move_tail(cpu_buffer); + + if (!tail->entries) { + tail->page->time_stamp = timestamp; + time_delta = 0; + ts_ext_size = 0; + write = event_size; + prev_write = 0; + } + + tail->write = write; + tail->entries++; + + cpu_buffer->write_stamp = timestamp; + + event = (struct ring_buffer_event *)(tail->page->data + prev_write); + if (ts_ext_size) { + event = rb_event_add_ts_extend(event, time_delta); + time_delta = 0; + } + + event->type_len = 0; + event->time_delta = time_delta; + event->array[0] = event_size - RB_EVNT_HDR_SIZE; + + return event; +} + 
+/** + * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @length: Size of the entry in bytes + * @timestamp: Timestamp of the entry + * + * Returns the address of the entry where to write data or NULL + */ +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp) +{ + struct ring_buffer_event *rb_event; + + if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY) + return NULL; + + rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp); + + return &rb_event->array[1]; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve); + +/** + * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve() + * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved + */ +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->tail_page->page->commit, + cpu_buffer->tail_page->write); + simple_rb_meta_inc(cpu_buffer->meta->entries, 1); + + /* + * Paired with simple_rb_enable_tracing() to ensure data is + * written to the ring-buffer before teardown. + */ + smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_commit); + +static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + u32 prev_status; + + if (enable) + return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY); + + /* Wait for the buffer to be released */ + do { + prev_status = cmpxchg_acquire(&cpu_buffer->status, + SIMPLE_RB_READY, + SIMPLE_RB_UNAVAILABLE); + } while (prev_status == SIMPLE_RB_WRITING); + + return prev_status; +} + +/** + * simple_ring_buffer_reset - Reset @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * + * This will not clear the content of the data, only reset counters and pointers + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded. 
+ */ +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *bpage; + u32 prev_status; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + prev_status = simple_rb_enable_tracing(cpu_buffer, false); + + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + bpage = cpu_buffer->tail_page = cpu_buffer->head_page; + do { + simple_bpage_reset(bpage); + bpage = simple_bpage_next_page(bpage); + } while (bpage != cpu_buffer->head_page); + + simple_bpage_reset(cpu_buffer->reader_page); + + cpu_buffer->last_overrun = 0; + cpu_buffer->write_stamp = 0; + + cpu_buffer->meta->reader.read = 0; + cpu_buffer->meta->reader.lost_events = 0; + cpu_buffer->meta->entries = 0; + cpu_buffer->meta->overrun = 0; + cpu_buffer->meta->read = 0; + cpu_buffer->meta->pages_lost = 0; + cpu_buffer->meta->pages_touched = 0; + + if (prev_status == SIMPLE_RB_READY) + simple_rb_enable_tracing(cpu_buffer, true); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); + +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)) +{ + struct simple_buffer_page *bpage = bpages; + int ret = 0; + void *page; + int i; + + /* At least 1 reader page and two pages in the ring-buffer */ + if (desc->nr_page_va < 3) + return -EINVAL; + + memset(cpu_buffer, 0, sizeof(*cpu_buffer)); + + cpu_buffer->meta = load_page(desc->meta_va); + if (!cpu_buffer->meta) + return -EINVAL; + + memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); + cpu_buffer->meta->meta_page_size = PAGE_SIZE; + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; + + /* The reader page is not part of the ring initially */ + page = load_page(desc->page_va[0]); + if (!page) { + unload_page(cpu_buffer->meta); + return -EINVAL; + } + + simple_bpage_init(bpage, page); + bpage->id = 0; + + cpu_buffer->nr_pages = 1; 
+ + cpu_buffer->reader_page = bpage; + cpu_buffer->tail_page = bpage + 1; + cpu_buffer->head_page = bpage + 1; + + for (i = 1; i < desc->nr_page_va; i++) { + page = load_page(desc->page_va[i]); + if (!page) { + ret = -EINVAL; + break; + } + + simple_bpage_init(++bpage, page); + + bpage->link.next = &(bpage + 1)->link; + bpage->link.prev = &(bpage - 1)->link; + bpage->id = i; + + cpu_buffer->nr_pages = i + 1; + } + + if (ret) { + for (i--; i >= 0; i--) + unload_page((void *)desc->page_va[i]); + unload_page(cpu_buffer->meta); + + return ret; + } + + /* Close the ring */ + bpage->link.next = &cpu_buffer->tail_page->link; + cpu_buffer->tail_page->link.prev = &bpage->link; + + /* The last init'ed page points to the head page */ + simple_bpage_set_head_link(bpage); + + cpu_buffer->bpages = bpages; + + return 0; +} + +static void *__load_page(unsigned long page) +{ + return (void *)page; +} + +static void __unload_page(void *page) { } + +/** + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. 
+ * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid + */ +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)) +{ + int p; + + if (!simple_rb_loaded(cpu_buffer)) + return; + + simple_rb_enable_tracing(cpu_buffer, false); + + unload_page(cpu_buffer->meta); + for (p = 0; p < cpu_buffer->nr_pages; p++) + unload_page(cpu_buffer->bpages[p].page); + + cpu_buffer->bpages = NULL; +} + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. + */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + return simple_ring_buffer_unload_mm(cpu_buffer, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); + +/** + * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @enable: True to enable tracing, False to disable it + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded + */ +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + simple_rb_enable_tracing(cpu_buffer, enable); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 23de3719f495..6eb4d3097a4d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -47,7 +47,6 @@ #include <linux/trace.h> #include <linux/sched/clock.h> #include <linux/sched/rt.h> -#include <linux/fsnotify.h> #include 
<linux/irq_work.h> #include <linux/workqueue.h> #include <linux/sort.h> @@ -219,14 +218,36 @@ static void ftrace_trace_userstack(struct trace_array *tr, static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; -static bool allocate_snapshot; -static bool snapshot_at_boot; - static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; static int boot_instance_index; -static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; -static int boot_snapshot_index; +/* + * Repeated boot parameters, including Bootconfig array expansions, need + * to stay in the delimiter form that the existing parser consumes. + */ +void __init trace_append_boot_param(char *buf, const char *str, char sep, + int size) +{ + int len, needed, str_len; + + if (!*str) + return; + + len = strlen(buf); + str_len = strlen(str); + needed = len + str_len + 1; + + /* For continuation, account for the separator. */ + if (len) + needed++; + if (needed > size) + return; + + if (len) + buf[len++] = sep; + + strscpy(buf + len, str, size - len); +} static int __init set_cmdline_ftrace(char *str) { @@ -276,38 +297,6 @@ static int __init stop_trace_on_warning(char *str) } __setup("traceoff_on_warning", stop_trace_on_warning); -static int __init boot_alloc_snapshot(char *str) -{ - char *slot = boot_snapshot_info + boot_snapshot_index; - int left = sizeof(boot_snapshot_info) - boot_snapshot_index; - int ret; - - if (str[0] == '=') { - str++; - if (strlen(str) >= left) - return -1; - - ret = snprintf(slot, left, "%s\t", str); - boot_snapshot_index += ret; - } else { - allocate_snapshot = true; - /* We also need the main ring buffer expanded */ - trace_set_ring_buffer_expanded(NULL); - } - return 1; -} -__setup("alloc_snapshot", boot_alloc_snapshot); - - -static int __init boot_snapshot(char *str) -{ - snapshot_at_boot = true; - boot_alloc_snapshot(str); - return 1; -} -__setup("ftrace_boot_snapshot", boot_snapshot); - - static int __init boot_instance(char *str) { char 
*slot = boot_instance_info + boot_instance_index; @@ -329,7 +318,8 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { - strscpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + trace_append_boot_param(trace_boot_options_buf, str, ',', + MAX_TRACER_SIZE); return 1; } __setup("trace_options=", set_trace_boot_options); @@ -555,7 +545,7 @@ static bool update_marker_trace(struct trace_array *tr, int enabled) lockdep_assert_held(&event_mutex); if (enabled) { - if (!list_empty(&tr->marker_list)) + if (tr->trace_flags & TRACE_ITER(COPY_MARKER)) return false; list_add_rcu(&tr->marker_list, &marker_copies); @@ -563,10 +553,10 @@ static bool update_marker_trace(struct trace_array *tr, int enabled) return true; } - if (list_empty(&tr->marker_list)) + if (!(tr->trace_flags & TRACE_ITER(COPY_MARKER))) return false; - list_del_init(&tr->marker_list); + list_del_rcu(&tr->marker_list); tr->trace_flags &= ~TRACE_ITER(COPY_MARKER); return true; } @@ -578,8 +568,59 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr) tr->ring_buffer_expanded = true; } +static void trace_array_autoremove(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, autoremove_work); + + trace_array_destroy(tr); +} + +static struct workqueue_struct *autoremove_wq; + +static void trace_array_kick_autoremove(struct trace_array *tr) +{ + if (autoremove_wq) + queue_work(autoremove_wq, &tr->autoremove_work); +} + +static void trace_array_cancel_autoremove(struct trace_array *tr) +{ + /* + * Since this can be called inside trace_array_autoremove(), + * it has to avoid deadlock of the workqueue. 
+ */ + if (work_pending(&tr->autoremove_work)) + cancel_work_sync(&tr->autoremove_work); +} + +static void trace_array_init_autoremove(struct trace_array *tr) +{ + INIT_WORK(&tr->autoremove_work, trace_array_autoremove); +} + +static void trace_array_start_autoremove(void) +{ + if (autoremove_wq) + return; + + autoremove_wq = alloc_workqueue("tr_autoremove_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!autoremove_wq) + pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n"); +} + LIST_HEAD(ftrace_trace_arrays); +static int __trace_array_get(struct trace_array *this_tr) +{ + /* When free_on_close is set, this is not available anymore. */ + if (autoremove_wq && this_tr->free_on_close) + return -ENODEV; + + this_tr->ref++; + return 0; +} + int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; @@ -587,8 +628,7 @@ int trace_array_get(struct trace_array *this_tr) guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { - tr->ref++; - return 0; + return __trace_array_get(tr); } } @@ -599,6 +639,12 @@ static void __trace_array_put(struct trace_array *this_tr) { WARN_ON(!this_tr->ref); this_tr->ref--; + /* + * When free_on_close is set, prepare removing the array + * when the last reference is released. + */ + if (this_tr->ref == 1 && this_tr->free_on_close) + trace_array_kick_autoremove(this_tr); } /** @@ -807,47 +853,6 @@ void tracing_on(void) EXPORT_SYMBOL_GPL(tracing_on); #ifdef CONFIG_TRACER_SNAPSHOT -static void tracing_snapshot_instance_cond(struct trace_array *tr, - void *cond_data) -{ - unsigned long flags; - - if (in_nmi()) { - trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); - trace_array_puts(tr, "*** snapshot is being ignored ***\n"); - return; - } - - if (!tr->allocated_snapshot) { - trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); - trace_array_puts(tr, "*** stopping trace here! 
***\n"); - tracer_tracing_off(tr); - return; - } - - if (tr->mapped) { - trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); - trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); - return; - } - - /* Note, snapshot can not be used when the tracer uses it */ - if (tracer_uses_snapshot(tr->current_trace)) { - trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); - trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); - return; - } - - local_irq_save(flags); - update_max_tr(tr, current, smp_processor_id(), cond_data); - local_irq_restore(flags); -} - -void tracing_snapshot_instance(struct trace_array *tr) -{ - tracing_snapshot_instance_cond(tr, NULL); -} - /** * tracing_snapshot - take a snapshot of the current buffer. * @@ -871,138 +876,6 @@ void tracing_snapshot(void) EXPORT_SYMBOL_GPL(tracing_snapshot); /** - * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. - * @tr: The tracing instance to snapshot - * @cond_data: The data to be tested conditionally, and possibly saved - * - * This is the same as tracing_snapshot() except that the snapshot is - * conditional - the snapshot will only happen if the - * cond_snapshot.update() implementation receiving the cond_data - * returns true, which means that the trace array's cond_snapshot - * update() operation used the cond_data to determine whether the - * snapshot should be taken, and if it was, presumably saved it along - * with the snapshot. - */ -void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) -{ - tracing_snapshot_instance_cond(tr, cond_data); -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond); - -/** - * tracing_cond_snapshot_data - get the user data associated with a snapshot - * @tr: The tracing instance - * - * When the user enables a conditional snapshot using - * tracing_snapshot_cond_enable(), the user-defined cond_data is saved - * with the snapshot. This accessor is used to retrieve it. 
- * - * Should not be called from cond_snapshot.update(), since it takes - * the tr->max_lock lock, which the code calling - * cond_snapshot.update() has already done. - * - * Returns the cond_data associated with the trace array's snapshot. - */ -void *tracing_cond_snapshot_data(struct trace_array *tr) -{ - void *cond_data = NULL; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - - if (tr->cond_snapshot) - cond_data = tr->cond_snapshot->cond_data; - - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - - return cond_data; -} -EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); - -static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, - struct array_buffer *size_buf, int cpu_id); -static void set_buffer_entries(struct array_buffer *buf, unsigned long val); - -int tracing_alloc_snapshot_instance(struct trace_array *tr) -{ - int order; - int ret; - - if (!tr->allocated_snapshot) { - - /* Make the snapshot buffer have the same order as main buffer */ - order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); - ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order); - if (ret < 0) - return ret; - - /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, - &tr->array_buffer, RING_BUFFER_ALL_CPUS); - if (ret < 0) - return ret; - - tr->allocated_snapshot = true; - } - - return 0; -} - -static void free_snapshot(struct trace_array *tr) -{ - /* - * We don't free the ring buffer. instead, resize it because - * The max_tr ring buffer has some state (e.g. ring->clock) and - * we want preserve it. 
- */ - ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0); - ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&tr->snapshot_buffer, 1); - tracing_reset_online_cpus(&tr->snapshot_buffer); - tr->allocated_snapshot = false; -} - -static int tracing_arm_snapshot_locked(struct trace_array *tr) -{ - int ret; - - lockdep_assert_held(&trace_types_lock); - - spin_lock(&tr->snapshot_trigger_lock); - if (tr->snapshot == UINT_MAX || tr->mapped) { - spin_unlock(&tr->snapshot_trigger_lock); - return -EBUSY; - } - - tr->snapshot++; - spin_unlock(&tr->snapshot_trigger_lock); - - ret = tracing_alloc_snapshot_instance(tr); - if (ret) { - spin_lock(&tr->snapshot_trigger_lock); - tr->snapshot--; - spin_unlock(&tr->snapshot_trigger_lock); - } - - return ret; -} - -int tracing_arm_snapshot(struct trace_array *tr) -{ - guard(mutex)(&trace_types_lock); - return tracing_arm_snapshot_locked(tr); -} - -void tracing_disarm_snapshot(struct trace_array *tr) -{ - spin_lock(&tr->snapshot_trigger_lock); - if (!WARN_ON(!tr->snapshot)) - tr->snapshot--; - spin_unlock(&tr->snapshot_trigger_lock); -} - -/** * tracing_alloc_snapshot - allocate snapshot buffer. * * This only allocates the snapshot buffer if it isn't already @@ -1022,159 +895,18 @@ int tracing_alloc_snapshot(void) return ret; } -EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); - -/** - * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. - * - * This is similar to tracing_snapshot(), but it will allocate the - * snapshot buffer if it isn't already allocated. Use this only - * where it is safe to sleep, as the allocation may sleep. - * - * This causes a swap between the snapshot buffer and the current live - * tracing buffer. You can use this to take snapshots of the live - * trace when some condition is triggered, but continue to trace. 
- */ -void tracing_snapshot_alloc(void) -{ - int ret; - - ret = tracing_alloc_snapshot(); - if (ret < 0) - return; - - tracing_snapshot(); -} -EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); - -/** - * tracing_snapshot_cond_enable - enable conditional snapshot for an instance - * @tr: The tracing instance - * @cond_data: User data to associate with the snapshot - * @update: Implementation of the cond_snapshot update function - * - * Check whether the conditional snapshot for the given instance has - * already been enabled, or if the current tracer is already using a - * snapshot; if so, return -EBUSY, else create a cond_snapshot and - * save the cond_data and update function inside. - * - * Returns 0 if successful, error otherwise. - */ -int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, - cond_update_fn_t update) -{ - struct cond_snapshot *cond_snapshot __free(kfree) = - kzalloc_obj(*cond_snapshot); - int ret; - - if (!cond_snapshot) - return -ENOMEM; - - cond_snapshot->cond_data = cond_data; - cond_snapshot->update = update; - - guard(mutex)(&trace_types_lock); - - if (tracer_uses_snapshot(tr->current_trace)) - return -EBUSY; - - /* - * The cond_snapshot can only change to NULL without the - * trace_types_lock. We don't care if we race with it going - * to NULL, but we want to make sure that it's not set to - * something other than NULL when we get here, which we can - * do safely with only holding the trace_types_lock and not - * having to take the max_lock. 
- */ - if (tr->cond_snapshot) - return -EBUSY; - - ret = tracing_arm_snapshot_locked(tr); - if (ret) - return ret; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - tr->cond_snapshot = no_free_ptr(cond_snapshot); - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - - return 0; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); - -/** - * tracing_snapshot_cond_disable - disable conditional snapshot for an instance - * @tr: The tracing instance - * - * Check whether the conditional snapshot for the given instance is - * enabled; if so, free the cond_snapshot associated with it, - * otherwise return -EINVAL. - * - * Returns 0 if successful, error otherwise. - */ -int tracing_snapshot_cond_disable(struct trace_array *tr) -{ - int ret = 0; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - - if (!tr->cond_snapshot) - ret = -EINVAL; - else { - kfree(tr->cond_snapshot); - tr->cond_snapshot = NULL; - } - - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - - tracing_disarm_snapshot(tr); - - return ret; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); -void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) -{ - WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond); -int tracing_alloc_snapshot(void) -{ - WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); - return -ENODEV; -} -EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); -void *tracing_cond_snapshot_data(struct trace_array *tr) -{ - return NULL; -} -EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); -int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) -{ - return -ENODEV; 
-} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); -int tracing_snapshot_cond_disable(struct trace_array *tr) -{ - return false; -} -EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); -#define free_snapshot(tr) do { } while (0) -#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1487,206 +1219,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) unsigned long __read_mostly tracing_thresh; -#ifdef CONFIG_TRACER_MAX_TRACE -#ifdef LATENCY_FS_NOTIFY -static struct workqueue_struct *fsnotify_wq; - -static void latency_fsnotify_workfn(struct work_struct *work) -{ - struct trace_array *tr = container_of(work, struct trace_array, - fsnotify_work); - fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); -} - -static void latency_fsnotify_workfn_irq(struct irq_work *iwork) -{ - struct trace_array *tr = container_of(iwork, struct trace_array, - fsnotify_irqwork); - queue_work(fsnotify_wq, &tr->fsnotify_work); -} - -__init static int latency_fsnotify_init(void) -{ - fsnotify_wq = alloc_workqueue("tr_max_lat_wq", - WQ_UNBOUND | WQ_HIGHPRI, 0); - if (!fsnotify_wq) { - pr_err("Unable to allocate tr_max_lat_wq\n"); - return -ENOMEM; - } - return 0; -} - -late_initcall_sync(latency_fsnotify_init); - -void latency_fsnotify(struct trace_array *tr) -{ - if (!fsnotify_wq) - return; - /* - * We cannot call queue_work(&tr->fsnotify_work) from here because it's - * possible that we are called from __schedule() or do_idle(), which - * could cause a deadlock. 
- */ - irq_work_queue(&tr->fsnotify_irqwork); -} -#endif /* !LATENCY_FS_NOTIFY */ - -static const struct file_operations tracing_max_lat_fops; - -static void trace_create_maxlat_file(struct trace_array *tr, - struct dentry *d_tracer) -{ -#ifdef LATENCY_FS_NOTIFY - INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); - init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); -#endif - tr->d_max_latency = trace_create_file("tracing_max_latency", - TRACE_MODE_WRITE, - d_tracer, tr, - &tracing_max_lat_fops); -} - -/* - * Copy the new maximum trace into the separate maximum-trace - * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/tracing/tracing_max_latency) - */ -static void -__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - struct array_buffer *trace_buf = &tr->array_buffer; - struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); - struct array_buffer *max_buf = &tr->snapshot_buffer; - struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); - - max_buf->cpu = cpu; - max_buf->time_start = data->preempt_timestamp; - - max_data->saved_latency = tr->max_latency; - max_data->critical_start = data->critical_start; - max_data->critical_end = data->critical_end; - - strscpy(max_data->comm, tsk->comm); - max_data->pid = tsk->pid; - /* - * If tsk == current, then use current_uid(), as that does not use - * RCU. The irq tracer can be called out of RCU scope. 
- */ - if (tsk == current) - max_data->uid = current_uid(); - else - max_data->uid = task_uid(tsk); - - max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; - max_data->policy = tsk->policy; - max_data->rt_priority = tsk->rt_priority; - - /* record this tasks comm */ - tracing_record_cmdline(tsk); - latency_fsnotify(tr); -} -#else -static inline void trace_create_maxlat_file(struct trace_array *tr, - struct dentry *d_tracer) { } -static inline void __update_max_tr(struct trace_array *tr, - struct task_struct *tsk, int cpu) { } -#endif /* CONFIG_TRACER_MAX_TRACE */ - -#ifdef CONFIG_TRACER_SNAPSHOT -/** - * update_max_tr - snapshot all trace buffers from global_trace to max_tr - * @tr: tracer - * @tsk: the task with the latency - * @cpu: The cpu that initiated the trace. - * @cond_data: User data associated with a conditional snapshot - * - * Flip the buffers between the @tr and the max_tr and record information - * about which task was the cause of this latency. - */ -void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, - void *cond_data) -{ - if (tr->stop_count) - return; - - WARN_ON_ONCE(!irqs_disabled()); - - if (!tr->allocated_snapshot) { - /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(tr->current_trace != &nop_trace); - return; - } - - arch_spin_lock(&tr->max_lock); - - /* Inherit the recordable setting from array_buffer */ - if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) - ring_buffer_record_on(tr->snapshot_buffer.buffer); - else - ring_buffer_record_off(tr->snapshot_buffer.buffer); - - if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { - arch_spin_unlock(&tr->max_lock); - return; - } - - swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer); - - __update_max_tr(tr, tsk, cpu); - - arch_spin_unlock(&tr->max_lock); - - /* Any waiters on the old snapshot buffer need to wake up */ - ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); -} - -/** - * 
update_max_tr_single - only copy one trace over, and reset the rest - * @tr: tracer - * @tsk: task with the latency - * @cpu: the cpu of the buffer to copy. - * - * Flip the trace of a single CPU buffer between the @tr and the max_tr. - */ -void -update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) -{ - int ret; - - if (tr->stop_count) - return; - - WARN_ON_ONCE(!irqs_disabled()); - if (!tr->allocated_snapshot) { - /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(tr->current_trace != &nop_trace); - return; - } - - arch_spin_lock(&tr->max_lock); - - ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu); - - if (ret == -EBUSY) { - /* - * We failed to swap the buffer due to a commit taking - * place on this CPU. We fail to record, but we reset - * the max trace buffer (no one writes directly to it) - * and flag that it failed. - * Another reason is resize is in progress. - */ - trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_, - "Failed to swap buffers due to commit or resize in progress\n"); - } - - WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); - - __update_max_tr(tr, tsk, cpu); - arch_spin_unlock(&tr->max_lock); -} -#endif /* CONFIG_TRACER_SNAPSHOT */ - struct pipe_wait { struct trace_iterator *iter; int wait_index; @@ -1995,7 +1527,7 @@ int __init register_tracer(struct tracer *type) return 0; } -static void tracing_reset_cpu(struct array_buffer *buf, int cpu) +void tracing_reset_cpu(struct array_buffer *buf, int cpu) { struct trace_buffer *buffer = buf->buffer; @@ -3760,50 +3292,6 @@ static void test_ftrace_alive(struct seq_file *m) "# MAY BE MISSING FUNCTION EVENTS\n"); } -#ifdef CONFIG_TRACER_SNAPSHOT -static void show_snapshot_main_help(struct seq_file *m) -{ - seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" - "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" - "# Takes a snapshot of the main buffer.\n" - "# 
echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" - "# (Doesn't have to be '2' works with any number that\n" - "# is not a '0' or '1')\n"); -} - -static void show_snapshot_percpu_help(struct seq_file *m) -{ - seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); -#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" - "# Takes a snapshot of the main buffer for this cpu.\n"); -#else - seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" - "# Must use main snapshot file to allocate.\n"); -#endif - seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" - "# (Doesn't have to be '2' works with any number that\n" - "# is not a '0' or '1')\n"); -} - -static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) -{ - if (iter->tr->allocated_snapshot) - seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); - else - seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); - - seq_puts(m, "# Snapshot commands:\n"); - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - show_snapshot_main_help(m); - else - show_snapshot_percpu_help(m); -} -#else -/* Should never be called */ -static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } -#endif - static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -3852,17 +3340,6 @@ static int s_show(struct seq_file *m, void *v) return 0; } -/* - * Should be used after trace_array_get(), trace_types_lock - * ensures that i_cdev was already initialized. 
- */ -static inline int tracing_get_cpu(struct inode *inode) -{ - if (inode->i_cdev) /* See trace_create_cpu_file() */ - return (long)inode->i_cdev - 1; - return RING_BUFFER_ALL_CPUS; -} - static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, @@ -3889,7 +3366,7 @@ static void free_trace_iter_content(struct trace_iterator *iter) free_cpumask_var(iter->started); } -static struct trace_iterator * +struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { struct trace_array *tr = inode->i_private; @@ -4022,6 +3499,11 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) if (ret) return ret; + if ((filp->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) { + trace_array_put(tr); + return -EACCES; + } + filp->private_data = inode->i_private; return 0; @@ -4050,8 +3532,6 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) event_file_get(file); } - filp->private_data = inode->i_private; - return 0; } @@ -4071,7 +3551,7 @@ int tracing_single_release_file_tr(struct inode *inode, struct file *filp) return single_release(inode, filp); } -static int tracing_release(struct inode *inode, struct file *file) +int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct seq_file *m = file->private_data; @@ -5222,7 +4702,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } -static void set_buffer_entries(struct array_buffer *buf, unsigned long val) +void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val) { int cpu; @@ -5233,40 +4713,12 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val) static void update_buffer_entries(struct array_buffer *buf, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { - set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); + trace_set_buffer_entries(buf, ring_buffer_size(buf->buffer, 0)); } else { per_cpu_ptr(buf->data, 
cpu)->entries = ring_buffer_size(buf->buffer, cpu); } } -#ifdef CONFIG_TRACER_SNAPSHOT -/* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct array_buffer *trace_buf, - struct array_buffer *size_buf, int cpu_id) -{ - int cpu, ret = 0; - - if (cpu_id == RING_BUFFER_ALL_CPUS) { - for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(trace_buf->buffer, - per_cpu_ptr(size_buf->data, cpu)->entries, cpu); - if (ret < 0) - break; - per_cpu_ptr(trace_buf->data, cpu)->entries = - per_cpu_ptr(size_buf->data, cpu)->entries; - } - } else { - ret = ring_buffer_resize(trace_buf->buffer, - per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); - if (ret == 0) - per_cpu_ptr(trace_buf->data, cpu_id)->entries = - per_cpu_ptr(size_buf->data, cpu_id)->entries; - } - - return ret; -} -#endif /* CONFIG_TRACER_SNAPSHOT */ - static int __tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu) { @@ -5462,6 +4914,10 @@ static void update_last_data(struct trace_array *tr) /* Only if the buffer has previous boot data clear and update it. */ tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT; + /* If this is a backup instance, mark it for autoremove. 
*/ + if (tr->flags & TRACE_ARRAY_FL_VMALLOC) + tr->free_on_close = true; + /* Reset the module list and reload them */ if (tr->scratch) { struct trace_scratch *tscratch = tr->scratch; @@ -5685,9 +5141,8 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, return ret; } -static ssize_t -tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, - size_t cnt, loff_t *ppos) +ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos) { char buf[64]; int r; @@ -5699,9 +5154,8 @@ tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, - size_t cnt, loff_t *ppos) +ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos) { unsigned long val; int ret; @@ -5743,28 +5197,6 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf, return cnt; } -#ifdef CONFIG_TRACER_MAX_TRACE - -static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - - return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); -} - -static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - struct trace_array *tr = filp->private_data; - - return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); -} - -#endif - static int open_pipe_on_cpu(struct trace_array *tr, int cpu) { if (cpu == RING_BUFFER_ALL_CPUS) { @@ -6784,6 +6216,23 @@ char *trace_user_fault_read(struct trace_user_buf_info *tinfo, do { /* + * It is possible that something is trying to migrate this + * task. What happens then, is when preemption is enabled, + * the migration thread will preempt this task, try to + * migrate it, fail, then let it run again. That will + * cause this to loop again and never succeed. 
+ * On failures, enabled and disable preemption with + * migration enabled, to allow the migration thread to + * migrate this task. + */ + if (trys) { + preempt_enable_notrace(); + preempt_disable_notrace(); + cpu = smp_processor_id(); + buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf; + } + + /* * If for some reason, copy_from_user() always causes a context * switch, this would then cause an infinite loop. * If this task is preempted by another user space task, it @@ -7080,6 +6529,11 @@ static int tracing_clock_open(struct inode *inode, struct file *file) if (ret) return ret; + if ((file->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) { + trace_array_put(tr); + return -EACCES; + } + ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) trace_array_put(tr); @@ -7125,194 +6579,6 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve return ring_buffer_event_time_stamp(buffer, rbe); } -struct ftrace_buffer_info { - struct trace_iterator iter; - void *spare; - unsigned int spare_cpu; - unsigned int spare_size; - unsigned int read; -}; - -#ifdef CONFIG_TRACER_SNAPSHOT -static int tracing_snapshot_open(struct inode *inode, struct file *file) -{ - struct trace_array *tr = inode->i_private; - struct trace_iterator *iter; - struct seq_file *m; - int ret; - - ret = tracing_check_open_get_tr(tr); - if (ret) - return ret; - - if (file->f_mode & FMODE_READ) { - iter = __tracing_open(inode, file, true); - if (IS_ERR(iter)) - ret = PTR_ERR(iter); - } else { - /* Writes still need the seq_file to hold the private data */ - ret = -ENOMEM; - m = kzalloc_obj(*m); - if (!m) - goto out; - iter = kzalloc_obj(*iter); - if (!iter) { - kfree(m); - goto out; - } - ret = 0; - - iter->tr = tr; - iter->array_buffer = &tr->snapshot_buffer; - iter->cpu_file = tracing_get_cpu(inode); - m->private = iter; - file->private_data = m; - } -out: - if (ret < 0) - trace_array_put(tr); - - return ret; -} - -static void tracing_swap_cpu_buffer(void 
*tr) -{ - update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); -} - -static ssize_t -tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, - loff_t *ppos) -{ - struct seq_file *m = filp->private_data; - struct trace_iterator *iter = m->private; - struct trace_array *tr = iter->tr; - unsigned long val; - int ret; - - ret = tracing_update_buffers(tr); - if (ret < 0) - return ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - guard(mutex)(&trace_types_lock); - - if (tracer_uses_snapshot(tr->current_trace)) - return -EBUSY; - - local_irq_disable(); - arch_spin_lock(&tr->max_lock); - if (tr->cond_snapshot) - ret = -EBUSY; - arch_spin_unlock(&tr->max_lock); - local_irq_enable(); - if (ret) - return ret; - - switch (val) { - case 0: - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) - return -EINVAL; - if (tr->allocated_snapshot) - free_snapshot(tr); - break; - case 1: -/* Only allow per-cpu swap if the ring buffer supports it */ -#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) - return -EINVAL; -#endif - if (tr->allocated_snapshot) - ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, - &tr->array_buffer, iter->cpu_file); - - ret = tracing_arm_snapshot_locked(tr); - if (ret) - return ret; - - /* Now, we're going to swap */ - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { - local_irq_disable(); - update_max_tr(tr, current, smp_processor_id(), NULL); - local_irq_enable(); - } else { - smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, - (void *)tr, 1); - } - tracing_disarm_snapshot(tr); - break; - default: - if (tr->allocated_snapshot) { - if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->snapshot_buffer); - else - tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file); - } - break; - } - - if (ret >= 0) { - *ppos += cnt; - ret = cnt; - } - - return ret; -} - -static int tracing_snapshot_release(struct inode 
*inode, struct file *file) -{ - struct seq_file *m = file->private_data; - int ret; - - ret = tracing_release(inode, file); - - if (file->f_mode & FMODE_READ) - return ret; - - /* If write only, the seq_file is just a stub */ - if (m) - kfree(m->private); - kfree(m); - - return 0; -} - -static int tracing_buffers_open(struct inode *inode, struct file *filp); -static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos); -static int tracing_buffers_release(struct inode *inode, struct file *file); -static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, unsigned int flags); - -static int snapshot_raw_open(struct inode *inode, struct file *filp) -{ - struct ftrace_buffer_info *info; - int ret; - - /* The following checks for tracefs lockdown */ - ret = tracing_buffers_open(inode, filp); - if (ret < 0) - return ret; - - info = filp->private_data; - - if (tracer_uses_snapshot(info->iter.trace)) { - tracing_buffers_release(inode, filp); - return -EBUSY; - } - - info->iter.snapshot = true; - info->iter.array_buffer = &info->iter.tr->snapshot_buffer; - - return ret; -} - -#endif /* CONFIG_TRACER_SNAPSHOT */ - - static const struct file_operations tracing_thresh_fops = { .open = tracing_open_generic, .read = tracing_thresh_read, @@ -7320,16 +6586,6 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; -#ifdef CONFIG_TRACER_MAX_TRACE -static const struct file_operations tracing_max_lat_fops = { - .open = tracing_open_generic_tr, - .read = tracing_max_lat_read, - .write = tracing_max_lat_write, - .llseek = generic_file_llseek, - .release = tracing_release_generic_tr, -}; -#endif - static const struct file_operations set_tracer_fops = { .open = tracing_open_generic_tr, .read = tracing_set_trace_read, @@ -7416,24 +6672,6 @@ static const struct file_operations last_boot_fops = { .release = tracing_seq_release, }; -#ifdef 
CONFIG_TRACER_SNAPSHOT -static const struct file_operations snapshot_fops = { - .open = tracing_snapshot_open, - .read = seq_read, - .write = tracing_snapshot_write, - .llseek = tracing_lseek, - .release = tracing_snapshot_release, -}; - -static const struct file_operations snapshot_raw_fops = { - .open = snapshot_raw_open, - .read = tracing_buffers_read, - .release = tracing_buffers_release, - .splice_read = tracing_buffers_splice_read, -}; - -#endif /* CONFIG_TRACER_SNAPSHOT */ - /* * trace_min_max_write - Write a u64 value to a trace_min_max_param struct * @filp: The active open file structure @@ -7793,7 +7031,7 @@ static const struct file_operations tracing_err_log_fops = { .release = tracing_err_log_release, }; -static int tracing_buffers_open(struct inode *inode, struct file *filp) +int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct ftrace_buffer_info *info; @@ -7841,9 +7079,8 @@ tracing_buffers_poll(struct file *filp, poll_table *poll_table) return trace_poll(iter, filp, poll_table); } -static ssize_t -tracing_buffers_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) +ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; @@ -7944,7 +7181,7 @@ static int tracing_buffers_flush(struct file *file, fl_owner_t id) return 0; } -static int tracing_buffers_release(struct inode *inode, struct file *file) +int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; @@ -8018,10 +7255,9 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) spd->partial[i].private = 0; } -static ssize_t -tracing_buffers_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) 
+ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; @@ -8175,43 +7411,17 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned return 0; } -#ifdef CONFIG_TRACER_SNAPSHOT -static int get_snapshot_map(struct trace_array *tr) +/* + * This is called when a VMA is duplicated (e.g., on fork()) to increment + * the user_mapped counter without remapping pages. + */ +static void tracing_buffers_mmap_open(struct vm_area_struct *vma) { - int err = 0; - - /* - * Called with mmap_lock held. lockdep would be unhappy if we would now - * take trace_types_lock. Instead use the specific - * snapshot_trigger_lock. - */ - spin_lock(&tr->snapshot_trigger_lock); - - if (tr->snapshot || tr->mapped == UINT_MAX) - err = -EBUSY; - else - tr->mapped++; - - spin_unlock(&tr->snapshot_trigger_lock); - - /* Wait for update_max_tr() to observe iter->tr->mapped */ - if (tr->mapped == 1) - synchronize_rcu(); - - return err; + struct ftrace_buffer_info *info = vma->vm_file->private_data; + struct trace_iterator *iter = &info->iter; + ring_buffer_map_dup(iter->array_buffer->buffer, iter->cpu_file); } -static void put_snapshot_map(struct trace_array *tr) -{ - spin_lock(&tr->snapshot_trigger_lock); - if (!WARN_ON(!tr->mapped)) - tr->mapped--; - spin_unlock(&tr->snapshot_trigger_lock); -} -#else -static inline int get_snapshot_map(struct trace_array *tr) { return 0; } -static inline void put_snapshot_map(struct trace_array *tr) { } -#endif static void tracing_buffers_mmap_close(struct vm_area_struct *vma) { @@ -8232,6 +7442,7 @@ static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long a } static const struct vm_operations_struct tracing_buffers_vmops = { + .open = tracing_buffers_mmap_open, .close = tracing_buffers_mmap_close, .may_split = tracing_buffers_may_split, }; @@ 
-8380,170 +7591,6 @@ static const struct file_operations tracing_dyn_info_fops = { }; #endif /* CONFIG_DYNAMIC_FTRACE */ -#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) -static void -ftrace_snapshot(unsigned long ip, unsigned long parent_ip, - struct trace_array *tr, struct ftrace_probe_ops *ops, - void *data) -{ - tracing_snapshot_instance(tr); -} - -static void -ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, - struct trace_array *tr, struct ftrace_probe_ops *ops, - void *data) -{ - struct ftrace_func_mapper *mapper = data; - long *count = NULL; - - if (mapper) - count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - - if (count) { - - if (*count <= 0) - return; - - (*count)--; - } - - tracing_snapshot_instance(tr); -} - -static int -ftrace_snapshot_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) -{ - struct ftrace_func_mapper *mapper = data; - long *count = NULL; - - seq_printf(m, "%ps:", (void *)ip); - - seq_puts(m, "snapshot"); - - if (mapper) - count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - - if (count) - seq_printf(m, ":count=%ld\n", *count); - else - seq_puts(m, ":unlimited\n"); - - return 0; -} - -static int -ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *init_data, void **data) -{ - struct ftrace_func_mapper *mapper = *data; - - if (!mapper) { - mapper = allocate_ftrace_func_mapper(); - if (!mapper) - return -ENOMEM; - *data = mapper; - } - - return ftrace_func_mapper_add_ip(mapper, ip, init_data); -} - -static void -ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, - unsigned long ip, void *data) -{ - struct ftrace_func_mapper *mapper = data; - - if (!ip) { - if (!mapper) - return; - free_ftrace_func_mapper(mapper, NULL); - return; - } - - ftrace_func_mapper_remove_ip(mapper, ip); -} - -static struct ftrace_probe_ops snapshot_probe_ops = { - .func = ftrace_snapshot, - .print = 
ftrace_snapshot_print, -}; - -static struct ftrace_probe_ops snapshot_count_probe_ops = { - .func = ftrace_count_snapshot, - .print = ftrace_snapshot_print, - .init = ftrace_snapshot_init, - .free = ftrace_snapshot_free, -}; - -static int -ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, - char *glob, char *cmd, char *param, int enable) -{ - struct ftrace_probe_ops *ops; - void *count = (void *)-1; - char *number; - int ret; - - if (!tr) - return -ENODEV; - - /* hash funcs only work with set_ftrace_filter */ - if (!enable) - return -EINVAL; - - ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - - if (glob[0] == '!') { - ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); - if (!ret) - tracing_disarm_snapshot(tr); - - return ret; - } - - if (!param) - goto out_reg; - - number = strsep(¶m, ":"); - - if (!strlen(number)) - goto out_reg; - - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, (unsigned long *)&count); - if (ret) - return ret; - - out_reg: - ret = tracing_arm_snapshot(tr); - if (ret < 0) - return ret; - - ret = register_ftrace_function_probe(glob, tr, ops, count); - if (ret < 0) - tracing_disarm_snapshot(tr); - - return ret < 0 ? 
ret : 0; -} - -static struct ftrace_func_command ftrace_snapshot_cmd = { - .name = "snapshot", - .func = ftrace_trace_snapshot_callback, -}; - -static __init int register_snapshot_cmd(void) -{ - return register_ftrace_command(&ftrace_snapshot_cmd); -} -#else -static inline __init int register_snapshot_cmd(void) { return 0; } -#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ - static struct dentry *tracing_get_dentry(struct trace_array *tr) { /* Top directory uses NULL as the parent */ @@ -8576,7 +7623,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) return tr->percpu_dir; } -static struct dentry * +struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { @@ -9336,8 +8383,7 @@ static void setup_trace_scratch(struct trace_array *tr, memset(tscratch, 0, size); } -static int -allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) +int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size) { enum ring_buffer_flags rb_flags; struct trace_scratch *tscratch; @@ -9376,8 +8422,8 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size } /* Allocate the first page for all buffers */ - set_buffer_entries(&tr->array_buffer, - ring_buffer_size(tr->array_buffer.buffer, 0)); + trace_set_buffer_entries(&tr->array_buffer, + ring_buffer_size(tr->array_buffer.buffer, 0)); return 0; } @@ -9392,7 +8438,7 @@ static void free_trace_buffer(struct array_buffer *buf) } } -static int allocate_trace_buffers(struct trace_array *tr, int size) +static int allocate_trace_buffers(struct trace_array *tr, unsigned long size) { int ret; @@ -9400,23 +8446,11 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) if (ret) return ret; -#ifdef CONFIG_TRACER_SNAPSHOT - /* Fix mapped buffer trace arrays do not have snapshot buffers */ - if 
(tr->range_addr_start) - return 0; - - ret = allocate_trace_buffer(tr, &tr->snapshot_buffer, - allocate_snapshot ? size : 1); - if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) { + ret = trace_allocate_snapshot(tr, size); + if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) free_trace_buffer(&tr->array_buffer); - return -ENOMEM; - } - tr->allocated_snapshot = allocate_snapshot; - - allocate_snapshot = false; -#endif - return 0; + return ret; } static void free_trace_buffers(struct trace_array *tr) @@ -9497,8 +8531,8 @@ struct trace_array *trace_array_find_get(const char *instance) guard(mutex)(&trace_types_lock); tr = trace_array_find(instance); - if (tr) - tr->ref++; + if (tr && __trace_array_get(tr) < 0) + tr = NULL; return tr; } @@ -9595,6 +8629,8 @@ trace_array_create_systems(const char *name, const char *systems, if (ftrace_allocate_ftrace_ops(tr) < 0) goto out_free_tr; + trace_array_init_autoremove(tr); + ftrace_init_trace_array(tr); init_trace_flags_index(tr); @@ -9705,7 +8741,9 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr->name && strcmp(tr->name, name) == 0) { - tr->ref++; + /* if this fails, @tr is going to be removed. 
*/ + if (__trace_array_get(tr) < 0) + tr = NULL; return tr; } } @@ -9731,18 +8769,20 @@ static int __remove_instance(struct trace_array *tr) list_del(&tr->list); - /* Disable all the flags that were enabled coming in */ - for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { - if ((1ULL << i) & ZEROED_TRACE_FLAGS) - set_tracer_flag(tr, 1ULL << i, 0); - } - if (printk_trace == tr) update_printk_trace(&global_trace); + /* Must be done before disabling all the flags */ if (update_marker_trace(tr, 0)) synchronize_rcu(); + /* Disable all the flags that were enabled coming in */ + for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) { + if ((1ULL << i) & ZEROED_TRACE_FLAGS) + set_tracer_flag(tr, 1ULL << i, 0); + } + + trace_array_cancel_autoremove(tr); tracing_set_nop(tr); clear_ftrace_function_probes(tr); event_trace_del_tracer(tr); @@ -9835,17 +8875,22 @@ static __init void create_trace_instances(struct dentry *d_tracer) static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { + umode_t writable_mode = TRACE_MODE_WRITE; int cpu; + if (trace_array_is_readonly(tr)) + writable_mode = TRACE_MODE_READ; + trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, - tr, &show_traces_fops); + tr, &show_traces_fops); - trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, - tr, &set_tracer_fops); + trace_create_file("current_tracer", writable_mode, d_tracer, + tr, &set_tracer_fops); - trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, + trace_create_file("tracing_cpumask", writable_mode, d_tracer, tr, &tracing_cpumask_fops); + /* Options are used for changing print-format even for readonly instance. 
*/ trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops); @@ -9855,12 +8900,36 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, tr, &tracing_pipe_fops); - trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, + trace_create_file("buffer_size_kb", writable_mode, d_tracer, tr, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops); + trace_create_file("trace_clock", writable_mode, d_tracer, tr, + &trace_clock_fops); + + trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, + &trace_time_stamp_mode_fops); + + tr->buffer_percent = 50; + + trace_create_file("buffer_subbuf_size_kb", writable_mode, d_tracer, + tr, &buffer_subbuf_size_fops); + + create_trace_options_dir(tr); + + if (tr->range_addr_start) + trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, + tr, &last_boot_fops); + + for_each_tracing_cpu(cpu) + tracing_init_tracefs_percpu(tr, cpu); + + /* Read-only instance has above files only. 
*/ + if (trace_array_is_readonly(tr)) + return; + trace_create_file("free_buffer", 0200, d_tracer, tr, &tracing_free_buffer_fops); @@ -9872,49 +8941,29 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); - trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, - &trace_clock_fops); - - trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, - tr, &rb_simple_fops); - - trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, - &trace_time_stamp_mode_fops); - - tr->buffer_percent = 50; - trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer, - tr, &buffer_percent_fops); - - trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer, - tr, &buffer_subbuf_size_fops); + tr, &buffer_percent_fops); trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer, - tr, &tracing_syscall_buf_fops); + tr, &tracing_syscall_buf_fops); - create_trace_options_dir(tr); + trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, + tr, &rb_simple_fops); trace_create_maxlat_file(tr, d_tracer); if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); - if (tr->range_addr_start) { - trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer, - tr, &last_boot_fops); #ifdef CONFIG_TRACER_SNAPSHOT - } else { + if (!tr->range_addr_start) trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); #endif - } trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); - for_each_tracing_cpu(cpu) - tracing_init_tracefs_percpu(tr, cpu); - ftrace_init_tracefs(tr, d_tracer); } @@ -10523,47 +9572,6 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, return done; } -#ifdef CONFIG_TRACER_SNAPSHOT -__init static bool tr_needs_alloc_snapshot(const char *name) -{ - char *test; - int len = strlen(name); - bool ret; - - if 
(!boot_snapshot_index) - return false; - - if (strncmp(name, boot_snapshot_info, len) == 0 && - boot_snapshot_info[len] == '\t') - return true; - - test = kmalloc(strlen(name) + 3, GFP_KERNEL); - if (!test) - return false; - - sprintf(test, "\t%s\t", name); - ret = strstr(boot_snapshot_info, test) == NULL; - kfree(test); - return ret; -} - -__init static void do_allocate_snapshot(const char *name) -{ - if (!tr_needs_alloc_snapshot(name)) - return; - - /* - * When allocate_snapshot is set, the next call to - * allocate_trace_buffers() (called by trace_array_get_by_name()) - * will allocate the snapshot buffer. That will also clear - * this flag. - */ - allocate_snapshot = true; -} -#else -static inline void do_allocate_snapshot(const char *name) { } -#endif - __init static int backup_instance_area(const char *backup, unsigned long *addr, phys_addr_t *size) { @@ -10713,8 +9721,7 @@ __init static void enable_instances(void) } } else { /* Only non mapped buffers have snapshot buffers */ - if (IS_ENABLED(CONFIG_TRACER_SNAPSHOT)) - do_allocate_snapshot(name); + do_allocate_snapshot(name); } tr = trace_array_create_systems(name, NULL, addr, size); @@ -10740,23 +9747,47 @@ __init static void enable_instances(void) /* * Backup buffers can be freed but need vfree(). */ - if (backup) - tr->flags |= TRACE_ARRAY_FL_VMALLOC; + if (backup) { + tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY; + trace_array_start_autoremove(); + } if (start || backup) { tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT; tr->range_name = no_free_ptr(rname); } + /* + * Save the events to start and enabled them after all boot instances + * have been created. 
+ */ + tr->boot_events = curr_str; + } + + /* Enable the events after all boot instances have been created */ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + + if (!tr->boot_events || !(*tr->boot_events)) { + tr->boot_events = NULL; + continue; + } + + curr_str = tr->boot_events; + + /* Clear the instance if this is a persistent buffer */ + if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT) + update_last_data(tr); + while ((tok = strsep(&curr_str, ","))) { early_enable_events(tr, tok, true); } + tr->boot_events = NULL; } } __init static int tracer_alloc_buffers(void) { - int ring_buf_size; + unsigned long ring_buf_size; int ret = -ENOMEM; @@ -10906,24 +9937,6 @@ struct trace_array *trace_get_global_array(void) } #endif -void __init ftrace_boot_snapshot(void) -{ -#ifdef CONFIG_TRACER_SNAPSHOT - struct trace_array *tr; - - if (!snapshot_at_boot) - return; - - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (!tr->allocated_snapshot) - continue; - - tracing_snapshot_instance(tr); - trace_array_puts(tr, "** Boot snapshot taken **\n"); - } -#endif -} - void __init early_trace_init(void) { if (tracepoint_printk) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b8f3804586a0..80fe152af1dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -264,6 +264,7 @@ static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_li typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); +#ifdef CONFIG_TRACER_SNAPSHOT /** * struct cond_snapshot - conditional snapshot data and callback * @@ -306,6 +307,7 @@ struct cond_snapshot { void *cond_data; cond_update_fn_t update; }; +#endif /* CONFIG_TRACER_SNAPSHOT */ /* * struct trace_func_repeats - used to keep track of the consecutive @@ -405,7 +407,10 @@ struct trace_array { unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; - const char *system_names; + union { + const char *system_names; + char *boot_events; + }; 
struct list_head err_log; struct dentry *dir; struct dentry *options; @@ -453,6 +458,12 @@ struct trace_array { * we do not waste memory on systems that are not using tracing. */ bool ring_buffer_expanded; + /* + * If the ring buffer is a read only backup instance, it will be + * removed after dumping all data via pipe, because no readable data. + */ + bool free_on_close; + struct work_struct autoremove_work; }; enum { @@ -462,6 +473,7 @@ enum { TRACE_ARRAY_FL_MOD_INIT = BIT(3), TRACE_ARRAY_FL_MEMMAP = BIT(4), TRACE_ARRAY_FL_VMALLOC = BIT(5), + TRACE_ARRAY_FL_RDONLY = BIT(6), }; #ifdef CONFIG_MODULES @@ -491,6 +503,12 @@ extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long extern struct trace_array *printk_trace; +static inline bool trace_array_is_readonly(struct trace_array *tr) +{ + /* backup instance is read only. */ + return tr->flags & TRACE_ARRAY_FL_RDONLY; +} + /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. 
@@ -675,6 +693,7 @@ void tracing_reset_all_online_cpus(void); void tracing_reset_all_online_cpus_unlocked(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); +int tracing_release(struct inode *inode, struct file *file); int tracing_release_generic_tr(struct inode *inode, struct file *file); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); @@ -684,12 +703,54 @@ void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); void tracer_tracing_disable(struct trace_array *tr); void tracer_tracing_enable(struct trace_array *tr); +int allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *trace_create_cpu_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + long cpu, + const struct file_operations *fops); + +struct trace_iterator *__tracing_open(struct inode *inode, struct file *file, + bool snapshot); +int tracing_buffers_open(struct inode *inode, struct file *filp); +ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos); +int tracing_buffers_release(struct inode *inode, struct file *file); +ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +ssize_t tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, + size_t cnt, loff_t *ppos); +ssize_t tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +void trace_set_buffer_entries(struct array_buffer *buf, unsigned long val); +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. 
+ */ +static inline int tracing_get_cpu(struct inode *inode) +{ + if (inode->i_cdev) /* See trace_create_cpu_file() */ + return (long)inode->i_cdev - 1; + return RING_BUFFER_ALL_CPUS; +} +void tracing_reset_cpu(struct array_buffer *buf, int cpu); + +struct ftrace_buffer_info { + struct trace_iterator iter; + void *spare; + unsigned int spare_cpu; + unsigned int spare_size; + unsigned int read; +}; /** * tracer_tracing_is_on_cpu - show real state of ring buffer enabled on for a cpu @@ -806,13 +867,13 @@ void update_max_tr_single(struct trace_array *tr, #if defined(CONFIG_TRACER_MAX_TRACE) && defined(CONFIG_FSNOTIFY) # define LATENCY_FS_NOTIFY #endif +#endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef LATENCY_FS_NOTIFY void latency_fsnotify(struct trace_array *tr); #else static inline void latency_fsnotify(struct trace_array *tr) { } #endif -#endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, int skip); @@ -828,11 +889,15 @@ static inline bool tracer_uses_snapshot(struct tracer *tracer) { return tracer->use_max_tr; } +void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer); #else static inline bool tracer_uses_snapshot(struct tracer *tracer) { return false; } +static inline void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) { } #endif void trace_last_func_repeats(struct trace_array *tr, @@ -862,6 +927,8 @@ extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); +void __init trace_append_boot_param(char *buf, const char *str, + char sep, int size); extern void trace_set_ring_buffer_expanded(struct trace_array *tr); extern bool tracing_selftest_disabled; @@ -1802,11 +1869,6 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr, const char *system, const char *event); -static inline void *event_file_data(struct file *filp) -{ - return 
READ_ONCE(file_inode(filp)->i_private); -} - extern struct mutex event_mutex; extern struct list_head ftrace_events; @@ -1827,12 +1889,22 @@ static inline struct trace_event_file *event_file_file(struct file *filp) struct trace_event_file *file; lockdep_assert_held(&event_mutex); - file = READ_ONCE(file_inode(filp)->i_private); + file = file_inode(filp)->i_private; if (!file || file->flags & EVENT_FILE_FL_FREED) return NULL; return file; } +static inline void *event_file_data(struct file *filp) +{ + struct trace_event_file *file; + + lockdep_assert_held(&event_mutex); + file = file_inode(filp)->i_private; + WARN_ON(!file || file->flags & EVENT_FILE_FL_FREED); + return file; +} + extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; extern const struct file_operations event_hist_debug_fops; @@ -2135,12 +2207,6 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops) extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); -extern int tracing_alloc_snapshot(void); -extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); -extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); - -extern int tracing_snapshot_cond_disable(struct trace_array *tr); -extern void *tracing_cond_snapshot_data(struct trace_array *tr); extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; @@ -2228,19 +2294,71 @@ static inline void trace_event_update_all(struct trace_eval_map **map, int len) #endif #ifdef CONFIG_TRACER_SNAPSHOT +extern const struct file_operations snapshot_fops; +extern const struct file_operations snapshot_raw_fops; + +/* Used when creating instances */ +int trace_allocate_snapshot(struct trace_array *tr, int size); + +int tracing_alloc_snapshot(void); +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); +int 
tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); +int tracing_snapshot_cond_disable(struct trace_array *tr); +void *tracing_cond_snapshot_data(struct trace_array *tr); void tracing_snapshot_instance(struct trace_array *tr); int tracing_alloc_snapshot_instance(struct trace_array *tr); +int tracing_arm_snapshot_locked(struct trace_array *tr); int tracing_arm_snapshot(struct trace_array *tr); void tracing_disarm_snapshot(struct trace_array *tr); -#else +void free_snapshot(struct trace_array *tr); +void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter); +int get_snapshot_map(struct trace_array *tr); +void put_snapshot_map(struct trace_array *tr); +int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id); +__init void do_allocate_snapshot(const char *name); +# ifdef CONFIG_DYNAMIC_FTRACE +__init int register_snapshot_cmd(void); +# else +static inline int register_snapshot_cmd(void) { return 0; } +# endif +#else /* !CONFIG_TRACER_SNAPSHOT */ +static inline int trace_allocate_snapshot(struct trace_array *tr, int size) { return 0; } static inline void tracing_snapshot_instance(struct trace_array *tr) { } static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) { return 0; } +static inline int tracing_arm_snapshot_locked(struct trace_array *tr) { return -EBUSY; } static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; } static inline void tracing_disarm_snapshot(struct trace_array *tr) { } -#endif +static inline void free_snapshot(struct trace_array *tr) {} +static inline void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); +} +static inline void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + return NULL; +} +static inline int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, 
cond_update_fn_t update) +{ + return -ENODEV; +} +static inline int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + return false; +} +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + /* Should never be called */ + WARN_ONCE(1, "Snapshot print function called without snapshot configured"); +} +static inline int get_snapshot_map(struct trace_array *tr) { return 0; } +static inline void put_snapshot_map(struct trace_array *tr) { } +static inline void do_allocate_snapshot(const char *name) { } +static inline int register_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_PREEMPT_TRACER void tracer_preempt_on(unsigned long a0, unsigned long a1); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index dbe29b4c6a7a..2ca2541c8a58 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -61,7 +61,8 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node) v = memparse(p, NULL); if (v < PAGE_SIZE) pr_err("Buffer size is too small: %s\n", p); - if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0) + if (trace_array_is_readonly(tr) || + tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0) pr_err("Failed to resize trace buffer to %s\n", p); } @@ -597,7 +598,7 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node) p = xbc_node_find_value(node, "tracer", NULL); if (p && *p != '\0') { - if (tracing_set_tracer(tr, p) < 0) + if (trace_array_is_readonly(tr) || tracing_set_tracer(tr, p) < 0) pr_err("Failed to set given tracer: %s\n", p); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9928da636c9d..c46e623e7e0d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1039,6 +1039,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task) struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = 
rcu_dereference_raw(tr->filtered_pids); trace_filter_add_remove_task(pid_list, NULL, task); @@ -1054,6 +1055,7 @@ event_filter_pid_sched_process_fork(void *data, struct trace_pid_list *pid_list; struct trace_array *tr = data; + guard(preempt)(); pid_list = rcu_dereference_sched(tr->filtered_pids); trace_filter_add_remove_task(pid_list, self, task); @@ -1399,6 +1401,9 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, { int ret; + if (trace_array_is_readonly(tr)) + return -EACCES; + mutex_lock(&event_mutex); ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod); mutex_unlock(&event_mutex); @@ -1716,7 +1721,7 @@ static int t_show_filters(struct seq_file *m, void *v) len = get_call_len(call); - seq_printf(m, "%s:%s%*.s%s\n", call->class->system, + seq_printf(m, "%s:%s%*s%s\n", call->class->system, trace_event_name(call), len, "", filter->filter_string); return 0; @@ -1748,7 +1753,7 @@ static int t_show_triggers(struct seq_file *m, void *v) len = get_call_len(call); list_for_each_entry_rcu(data, &file->triggers, list) { - seq_printf(m, "%s:%s%*.s", call->class->system, + seq_printf(m, "%s:%s%*s", call->class->system, trace_event_name(call), len, ""); data->cmd_ops->print(m, data); @@ -2182,12 +2187,12 @@ static int trace_format_open(struct inode *inode, struct file *file) static ssize_t event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - int id = (long)event_file_data(filp); + /* id is directly in i_private and available for inode's lifetime. 
*/ + int id = (long)file_inode(filp)->i_private; char buf[32]; int len; - if (unlikely(!id)) - return -ENODEV; + WARN_ON(!id); len = sprintf(buf, "%d\n", id); @@ -2245,12 +2250,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); file = event_file_file(filp); - if (file) { - if (file->flags & EVENT_FILE_FL_FREED) - err = -ENODEV; - else - err = apply_event_filter(file, buf); - } + if (file) + err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); kfree(buf); @@ -2971,8 +2972,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - /* ftrace only has directories no files */ - if (strcmp(name, "ftrace") == 0) + /* ftrace only has directories no files, readonly instance too. */ + if (strcmp(name, "ftrace") == 0 || trace_array_is_readonly(tr)) nr_entries = 0; else nr_entries = ARRAY_SIZE(system_entries); @@ -3137,28 +3138,30 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) int ret; static struct eventfs_entry event_entries[] = { { - .name = "enable", + .name = "format", .callback = event_callback, - .release = event_release, }, +#ifdef CONFIG_PERF_EVENTS { - .name = "filter", + .name = "id", .callback = event_callback, }, +#endif +#define NR_RO_EVENT_ENTRIES (1 + IS_ENABLED(CONFIG_PERF_EVENTS)) +/* Readonly files must be above this line and counted by NR_RO_EVENT_ENTRIES. 
*/ { - .name = "trigger", + .name = "enable", .callback = event_callback, + .release = event_release, }, { - .name = "format", + .name = "filter", .callback = event_callback, }, -#ifdef CONFIG_PERF_EVENTS { - .name = "id", + .name = "trigger", .callback = event_callback, }, -#endif #ifdef CONFIG_HIST_TRIGGERS { .name = "hist", @@ -3191,7 +3194,10 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file) if (!e_events) return -ENOMEM; - nr_entries = ARRAY_SIZE(event_entries); + if (trace_array_is_readonly(tr)) + nr_entries = NR_RO_EVENT_ENTRIES; + else + nr_entries = ARRAY_SIZE(event_entries); name = trace_event_name(call); ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file); @@ -3677,20 +3683,27 @@ static struct boot_triggers { } bootup_triggers[MAX_BOOT_TRIGGERS]; static char bootup_trigger_buf[COMMAND_LINE_SIZE]; +static int boot_trigger_buf_len; static int nr_boot_triggers; static __init int setup_trace_triggers(char *str) { char *trigger; char *buf; + int len = boot_trigger_buf_len; int i; - strscpy(bootup_trigger_buf, str, COMMAND_LINE_SIZE); + if (len >= COMMAND_LINE_SIZE) + return 1; + + strscpy(bootup_trigger_buf + len, str, COMMAND_LINE_SIZE - len); trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event triggers"); - buf = bootup_trigger_buf; - for (i = 0; i < MAX_BOOT_TRIGGERS; i++) { + buf = bootup_trigger_buf + len; + boot_trigger_buf_len += strlen(buf) + 1; + + for (i = nr_boot_triggers; i < MAX_BOOT_TRIGGERS; i++) { trigger = strsep(&buf, ","); if (!trigger) break; @@ -4491,7 +4504,11 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { - strscpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + if (bootup_event_buf[0] != '\0') + strlcat(bootup_event_buf, ",", COMMAND_LINE_SIZE); + + strlcat(bootup_event_buf, str, COMMAND_LINE_SIZE); + trace_set_ring_buffer_expanded(NULL); disable_tracing_selftest("running event tracing"); @@ 
-4530,31 +4547,44 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) int nr_entries; static struct eventfs_entry events_entries[] = { { - .name = "enable", + .name = "header_page", .callback = events_callback, }, { - .name = "header_page", + .name = "header_event", .callback = events_callback, }, +#define NR_RO_TOP_ENTRIES 2 +/* Readonly files must be above this line and counted by NR_RO_TOP_ENTRIES. */ { - .name = "header_event", + .name = "enable", .callback = events_callback, }, }; - entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, - tr, &ftrace_set_event_fops); - if (!entry) - return -ENOMEM; + if (!trace_array_is_readonly(tr)) { + entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_fops); + if (!entry) + return -ENOMEM; - trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr, - &ftrace_show_event_filters_fops); + /* There are not as crucial, just warn if they are not created */ + trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr, + &ftrace_show_event_filters_fops); - trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr, - &ftrace_show_event_triggers_fops); + trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr, + &ftrace_show_event_triggers_fops); - nr_entries = ARRAY_SIZE(events_entries); + trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent, + tr, &ftrace_set_event_pid_fops); + + trace_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); + nr_entries = ARRAY_SIZE(events_entries); + } else { + nr_entries = NR_RO_TOP_ENTRIES; + } e_events = eventfs_create_events_dir("events", parent, events_entries, nr_entries, tr); @@ -4563,15 +4593,6 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } - /* There are not as crucial, just warn if they are not created */ - - trace_create_file("set_event_pid", TRACE_MODE_WRITE, 
parent, - tr, &ftrace_set_event_pid_fops); - - trace_create_file("set_event_notrace_pid", - TRACE_MODE_WRITE, parent, tr, - &ftrace_set_event_notrace_pid_fops); - tr->event_dir = e_events; return 0; @@ -4668,26 +4689,22 @@ static __init int event_trace_memsetup(void) return 0; } -__init void -early_enable_events(struct trace_array *tr, char *buf, bool disable_first) +/* + * Helper function to enable or disable a comma-separated list of events + * from the bootup buffer. + */ +static __init void __early_set_events(struct trace_array *tr, char *buf, bool enable) { char *token; - int ret; - - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; + while ((token = strsep(&buf, ","))) { if (*token) { - /* Restarting syscalls requires that we stop them first */ - if (disable_first) + if (enable) { + if (ftrace_set_clr_event(tr, token, 1)) + pr_warn("Failed to enable trace event: %s\n", token); + } else { ftrace_set_clr_event(tr, token, 0); - - ret = ftrace_set_clr_event(tr, token, 1); - if (ret) - pr_warn("Failed to enable trace event: %s\n", token); + } } /* Put back the comma to allow this to be called again */ @@ -4696,6 +4713,32 @@ early_enable_events(struct trace_array *tr, char *buf, bool disable_first) } } +/** + * early_enable_events - enable events from the bootup buffer + * @tr: The trace array to enable the events in + * @buf: The buffer containing the comma separated list of events + * @disable_first: If true, disable all events in @buf before enabling them + * + * This function enables events from the bootup buffer. If @disable_first + * is true, it will first disable all events in the buffer before enabling + * them. + * + * For syscall events, which rely on a global refcount to register the + * SYSCALL_WORK_SYSCALL_TRACEPOINT flag (especially for pid 1), we must + * ensure the refcount hits zero before re-enabling them. 
A simple + * "disable then enable" per-event is not enough if multiple syscalls are + * used, as the refcount will stay above zero. Thus, we need a two-phase + * approach: disable all, then enable all. + */ +__init void +early_enable_events(struct trace_array *tr, char *buf, bool disable_first) +{ + if (disable_first) + __early_set_events(tr, buf, false); + + __early_set_events(tr, buf, true); +} + static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 73ea180cad55..0dbbf6cca9bc 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1361,12 +1361,17 @@ static const char *hist_field_name(struct hist_field *field, field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { static char full_name[MAX_FILTER_STR_VAL]; + static char *fmt; + int len; + + fmt = field->flags & HIST_FIELD_FL_VAR_REF ? "%s.%s.$%s" : "%s.%s.%s"; + + len = snprintf(full_name, sizeof(full_name), fmt, + field->system, field->event_name, + field->name); + if (len >= sizeof(full_name)) + return NULL; - strcat(full_name, field->system); - strcat(full_name, "."); - strcat(full_name, field->event_name); - strcat(full_name, "."); - strcat(full_name, field->name); field_name = full_name; } else field_name = field->name; @@ -1740,9 +1745,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) static void expr_field_str(struct hist_field *field, char *expr) { - if (field->flags & HIST_FIELD_FL_VAR_REF) - strcat(expr, "$"); - else if (field->flags & HIST_FIELD_FL_CONST) { + if (field->flags & HIST_FIELD_FL_VAR_REF) { + if (!field->system) + strcat(expr, "$"); + } else if (field->flags & HIST_FIELD_FL_CONST) { char str[HIST_CONST_DIGITS_MAX]; snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant); @@ -5836,8 +5842,6 @@ static int event_hist_open(struct inode *inode, struct file *file) hist_file->file = file; 
hist_file->last_act = get_hist_hit_count(event_file); - /* Clear private_data to avoid warning in single_open() */ - file->private_data = NULL; ret = single_open(file, hist_show, hist_file); if (ret) { kfree(hist_file); @@ -6126,8 +6130,6 @@ static int event_hist_debug_open(struct inode *inode, struct file *file) if (ret) return ret; - /* Clear private_data to avoid warning in single_open() */ - file->private_data = NULL; ret = single_open(file, hist_debug_show, file); if (ret) tracing_release_file_tr(inode, file); @@ -6158,7 +6160,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS) - seq_putc(m, '$'); + if (!hist_field->system) + seq_putc(m, '$'); seq_printf(m, "%s", field_name); } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) seq_puts(m, "common_timestamp"); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8bb95b2a6fcf..39ac4eba0702 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -395,7 +395,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64++; } else { struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; + __def_gfpflag_names }; char *space = (i == se->n_fields - 1 ? 
"" : " "); print_synth_event_num_val(s, print_fmt, @@ -408,7 +408,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", entry->fields[n_u64].as_u64, - __flags); + __flags, ARRAY_SIZE(__flags)); trace_seq_putc(s, ')'); } n_u64++; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index fecbd679d432..655db2e82513 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -22,6 +22,39 @@ static struct task_struct *trigger_kthread; static struct llist_head trigger_data_free_list; static DEFINE_MUTEX(trigger_data_kthread_mutex); +static int trigger_kthread_fn(void *ignore); + +static void trigger_create_kthread_locked(void) +{ + lockdep_assert_held(&trigger_data_kthread_mutex); + + if (!trigger_kthread) { + struct task_struct *kthread; + + kthread = kthread_create(trigger_kthread_fn, NULL, + "trigger_data_free"); + if (!IS_ERR(kthread)) + WRITE_ONCE(trigger_kthread, kthread); + } +} + +static void trigger_data_free_queued_locked(void) +{ + struct event_trigger_data *data, *tmp; + struct llist_node *llnodes; + + lockdep_assert_held(&trigger_data_kthread_mutex); + + llnodes = llist_del_all(&trigger_data_free_list); + if (!llnodes) + return; + + tracepoint_synchronize_unregister(); + + llist_for_each_entry_safe(data, tmp, llnodes, llist) + kfree(data); +} + /* Bulk garbage collection of event_trigger_data elements */ static int trigger_kthread_fn(void *ignore) { @@ -50,33 +83,56 @@ static int trigger_kthread_fn(void *ignore) void trigger_data_free(struct event_trigger_data *data) { + if (!data) + return; + if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); + /* + * Boot-time trigger registration can fail before kthread creation + * works. Keep the deferred-free semantics during boot and let late + * init start the kthread to drain the list. 
+ */ + if (system_state == SYSTEM_BOOTING && !trigger_kthread) { + llist_add(&data->llist, &trigger_data_free_list); + return; + } + if (unlikely(!trigger_kthread)) { guard(mutex)(&trigger_data_kthread_mutex); + + trigger_create_kthread_locked(); /* Check again after taking mutex */ if (!trigger_kthread) { - struct task_struct *kthread; - - kthread = kthread_create(trigger_kthread_fn, NULL, - "trigger_data_free"); - if (!IS_ERR(kthread)) - WRITE_ONCE(trigger_kthread, kthread); + llist_add(&data->llist, &trigger_data_free_list); + /* Drain the queued frees synchronously if creation failed. */ + trigger_data_free_queued_locked(); + return; } } - if (!trigger_kthread) { - /* Do it the slow way */ - tracepoint_synchronize_unregister(); - kfree(data); - return; - } - llist_add(&data->llist, &trigger_data_free_list); wake_up_process(trigger_kthread); } +static int __init trigger_data_free_init(void) +{ + guard(mutex)(&trigger_data_kthread_mutex); + + if (llist_empty(&trigger_data_free_list)) + return 0; + + trigger_create_kthread_locked(); + if (trigger_kthread) + wake_up_process(trigger_kthread); + else + trigger_data_free_queued_locked(); + + return 0; +} +late_initcall(trigger_data_free_init); + static inline void data_ops_trigger(struct event_trigger_data *data, struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3d8239fee004..0d2d3a2ea7dd 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -400,14 +400,19 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, struct ftrace_regs *fregs) { + unsigned long *task_var = fgraph_get_task_var(gops); struct fgraph_times *ftimes; struct trace_array *tr; + unsigned int trace_ctx; + u64 calltime, rettime; int size; + rettime = trace_clock_local(); + ftrace_graph_addr_finish(gops, trace); - if 
(trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) { - trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT); + if (*task_var & TRACE_GRAPH_NOTRACE) { + *task_var &= ~TRACE_GRAPH_NOTRACE; return; } @@ -418,11 +423,13 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, tr = gops->private; handle_nosleeptime(tr, trace, ftimes, size); - if (tracing_thresh && - (trace_clock_local() - ftimes->calltime < tracing_thresh)) + calltime = ftimes->calltime; + + if (tracing_thresh && (rettime - calltime < tracing_thresh)) return; - else - trace_graph_return(trace, gops, fregs); + + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } static struct fgraph_ops funcgraph_ops = { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a5dbb72528e0..a8420e6abb56 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -31,7 +31,8 @@ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; static int __init set_kprobe_boot_events(char *str) { - strscpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + trace_append_boot_param(kprobe_boot_events_buf, str, ';', + COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); return 1; @@ -765,6 +766,14 @@ static unsigned int number_of_same_symbols(const char *mod, const char *func_nam if (!mod) kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + /* + * If the symbol is found in vmlinux, use vmlinux resolution only. + * This prevents module symbols from shadowing vmlinux symbols + * and causing -EADDRNOTAVAIL for unqualified kprobe targets. 
+ */ + if (!mod && ctx.count > 0) + return ctx.count; + module_kallsyms_on_each_symbol(mod, count_mod_symbols, &ctx); return ctx.count; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index dee610e465b9..75678053b21c 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -58,6 +58,7 @@ enum osnoise_options_index { OSN_PANIC_ON_STOP, OSN_PREEMPT_DISABLE, OSN_IRQ_DISABLE, + OSN_TIMERLAT_ALIGN, OSN_MAX }; @@ -66,7 +67,8 @@ static const char * const osnoise_options_str[OSN_MAX] = { "OSNOISE_WORKLOAD", "PANIC_ON_STOP", "OSNOISE_PREEMPT_DISABLE", - "OSNOISE_IRQ_DISABLE" }; + "OSNOISE_IRQ_DISABLE", + "TIMERLAT_ALIGN" }; #define OSN_DEFAULT_OPTIONS 0x2 static unsigned long osnoise_options = OSN_DEFAULT_OPTIONS; @@ -251,6 +253,11 @@ struct timerlat_variables { static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var); /* + * timerlat wake-up offset for next thread with TIMERLAT_ALIGN set. + */ +static atomic64_t align_next; + +/* * this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU */ static inline struct timerlat_variables *this_cpu_tmr_var(void) @@ -268,6 +275,7 @@ static inline void tlat_var_reset(void) /* Synchronize with the timerlat interfaces */ mutex_lock(&interface_lock); + /* * So far, all the values are initialized as 0, so * zeroing the structure is perfect. @@ -278,6 +286,12 @@ static inline void tlat_var_reset(void) hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); } + /* + * Reset also align_next, to be filled by a new offset by the first timerlat + * thread that wakes up, if TIMERLAT_ALIGN is set. 
+ */ + atomic64_set(&align_next, 0); + mutex_unlock(&interface_lock); } #else /* CONFIG_TIMERLAT_TRACER */ @@ -326,6 +340,7 @@ static struct osnoise_data { u64 stop_tracing_total; /* stop trace in the final operation (report/thread) */ #ifdef CONFIG_TIMERLAT_TRACER u64 timerlat_period; /* timerlat period */ + u64 timerlat_align_us; /* timerlat alignment */ u64 print_stack; /* print IRQ stack if total > */ int timerlat_tracer; /* timerlat tracer */ #endif @@ -338,6 +353,7 @@ static struct osnoise_data { #ifdef CONFIG_TIMERLAT_TRACER .print_stack = 0, .timerlat_period = DEFAULT_TIMERLAT_PERIOD, + .timerlat_align_us = 0, .timerlat_tracer = 0, #endif }; @@ -1830,6 +1846,26 @@ static int wait_next_period(struct timerlat_variables *tlat) tlat->abs_period = (u64) ktime_to_ns(next_abs_period); /* + * Align thread in the first cycle on each CPU to the set alignment + * if TIMERLAT_ALIGN is set. + * + * This is done by using an atomic64_t to store the next absolute period. + * The first thread that wakes up will set the atomic64_t to its + * absolute period, and the other threads will increment it by + * the alignment value. + */ + if (test_bit(OSN_TIMERLAT_ALIGN, &osnoise_options) && !tlat->count + && atomic64_cmpxchg_relaxed(&align_next, 0, tlat->abs_period)) { + /* + * A thread has already set align_next, use it and increment it + * to be used by the next thread that wakes up after this one. + */ + tlat->abs_period = atomic64_add_return_relaxed( + osnoise_data.timerlat_align_us * 1000, &align_next); + next_abs_period = ns_to_ktime(tlat->abs_period); + } + + /* * If the new abs_period is in the past, skip the activation. 
*/ while (ktime_compare(now, next_abs_period) > 0) { @@ -2073,8 +2109,8 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) if (!osnoise_has_registered_instances()) return; - guard(mutex)(&interface_lock); guard(cpus_read_lock)(); + guard(mutex)(&interface_lock); if (!cpu_online(cpu)) return; @@ -2237,11 +2273,11 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * avoid CPU hotplug operations that might read options. */ cpus_read_lock(); + mutex_lock(&interface_lock); retval = cnt; @@ -2257,8 +2293,8 @@ static ssize_t osnoise_options_write(struct file *filp, const char __user *ubuf, clear_bit(option, &osnoise_options); } - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); @@ -2345,16 +2381,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, if (running) stop_per_cpu_kthreads(); - mutex_lock(&interface_lock); /* * osnoise_cpumask is read by CPU hotplug operations. */ cpus_read_lock(); + mutex_lock(&interface_lock); cpumask_copy(&osnoise_cpumask, osnoise_cpumask_new); - cpus_read_unlock(); mutex_unlock(&interface_lock); + cpus_read_unlock(); if (running) start_per_cpu_kthreads(); @@ -2650,6 +2686,17 @@ static struct trace_min_max_param timerlat_period = { .min = &timerlat_min_period, }; +/* + * osnoise/timerlat_align_us: align the first wakeup of all timerlat + * threads to a common boundary (in us). 0 means disabled. 
+ */ +static struct trace_min_max_param timerlat_align_us = { + .lock = &interface_lock, + .val = &osnoise_data.timerlat_align_us, + .max = NULL, + .min = NULL, +}; + static const struct file_operations timerlat_fd_fops = { .open = timerlat_fd_open, .read = timerlat_fd_read, @@ -2746,6 +2793,11 @@ static int init_timerlat_tracefs(struct dentry *top_dir) if (!tmp) return -ENOMEM; + tmp = tracefs_create_file("timerlat_align_us", TRACE_MODE_WRITE, top_dir, + &timerlat_align_us, &trace_min_max_fops); + if (!tmp) + return -ENOMEM; + retval = osnoise_create_cpu_timerlat_fd(top_dir); if (retval) return retval; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1996d7aba038..a5ad76175d10 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,14 +69,15 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) const char * trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array) + const struct trace_print_flags *flag_array, + size_t flag_array_size) { unsigned long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -106,12 +107,13 @@ EXPORT_SYMBOL(trace_print_flags_seq); const char * trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array) + const struct trace_print_flags *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; @@ -133,14 +135,15 @@ EXPORT_SYMBOL(trace_print_symbols_seq); const char * trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct 
trace_print_flags_u64 *flag_array) + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size) { unsigned long long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -170,12 +173,13 @@ EXPORT_SYMBOL(trace_print_flags_seq_u64); const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 *symbol_array) + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; @@ -719,12 +723,13 @@ void print_function_args(struct trace_seq *s, unsigned long *args, { const struct btf_param *param; const struct btf_type *t; + const struct btf_enum *enums; const char *param_name; char name[KSYM_NAME_LEN]; unsigned long arg; struct btf *btf; s32 tid, nr = 0; - int a, p, x; + int a, p, x, i; u16 encode; trace_seq_printf(s, "("); @@ -778,6 +783,15 @@ void print_function_args(struct trace_seq *s, unsigned long *args, break; case BTF_KIND_ENUM: trace_seq_printf(s, "%ld", arg); + enums = btf_enum(t); + for (i = 0; i < btf_vlen(t); i++) { + if (arg == enums[i].val) { + trace_seq_printf(s, " [%s]", + btf_name_by_offset(btf, + enums[i].name_off)); + break; + } + } break; default: /* This does not handle complex arguments */ diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 5ea5e0d76f00..3ea17af60169 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -197,6 +197,7 @@ struct notifier_block module_trace_bprintk_format_nb = { .notifier_call = module_trace_bprintk_format_notify, }; +__printf(2, 3) int __trace_bprintk(unsigned long ip, const char *fmt, ...) 
{ int ret; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e0a5dc86c07e..e1c73065dae5 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1068,7 +1068,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) { size_t len = strlen(str); - if (str[len - 1] != '"') { + if (!len || str[len - 1] != '"') { trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE); return -EINVAL; } diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c new file mode 100644 index 000000000000..d6c3f94d67cd --- /dev/null +++ b/kernel/trace/trace_remote.c @@ -0,0 +1,1384 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort <vdonnefort@google.com> + */ + +#include <linux/kstrtox.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> +#include <linux/tracefs.h> +#include <linux/trace_remote.h> +#include <linux/trace_seq.h> +#include <linux/types.h> + +#include "trace.h" + +#define TRACEFS_DIR "remotes" +#define TRACEFS_MODE_WRITE 0640 +#define TRACEFS_MODE_READ 0440 + +enum tri_type { + TRI_CONSUMING, + TRI_NONCONSUMING, +}; + +struct trace_remote_iterator { + struct trace_remote *remote; + struct trace_seq seq; + struct delayed_work poll_work; + unsigned long lost_events; + u64 ts; + struct ring_buffer_iter *rb_iter; + struct ring_buffer_iter **rb_iters; + struct remote_event_hdr *evt; + int cpu; + int evt_cpu; + loff_t pos; + enum tri_type type; +}; + +struct trace_remote { + struct trace_remote_callbacks *cbs; + void *priv; + struct trace_buffer *trace_buffer; + struct trace_buffer_desc *trace_buffer_desc; + struct dentry *dentry; + struct eventfs_inode *eventfs; + struct remote_event *events; + unsigned long nr_events; + unsigned long trace_buffer_size; + struct ring_buffer_remote rb_remote; + struct mutex lock; + struct rw_semaphore reader_lock; + struct rw_semaphore *pcpu_reader_locks; + unsigned int nr_readers; + unsigned int poll_ms; + bool tracing_on; +}; 
+ +static bool trace_remote_loaded(struct trace_remote *remote) +{ + return !!remote->trace_buffer; +} + +static int trace_remote_load(struct trace_remote *remote) +{ + struct ring_buffer_remote *rb_remote = &remote->rb_remote; + struct trace_buffer_desc *desc; + + lockdep_assert_held(&remote->lock); + + if (trace_remote_loaded(remote)) + return 0; + + desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + rb_remote->desc = desc; + rb_remote->swap_reader_page = remote->cbs->swap_reader_page; + rb_remote->priv = remote->priv; + rb_remote->reset = remote->cbs->reset; + remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); + if (!remote->trace_buffer) { + remote->cbs->unload_trace_buffer(desc, remote->priv); + return -ENOMEM; + } + + remote->trace_buffer_desc = desc; + + return 0; +} + +static void trace_remote_try_unload(struct trace_remote *remote) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + /* The buffer is being read or writable */ + if (remote->nr_readers || remote->tracing_on) + return; + + /* The buffer has readable data */ + if (!ring_buffer_empty(remote->trace_buffer)) + return; + + ring_buffer_free(remote->trace_buffer); + remote->trace_buffer = NULL; + remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv); +} + +static int trace_remote_enable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (remote->tracing_on) + return 0; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + ret = remote->cbs->enable_tracing(true, remote->priv); + if (ret) { + trace_remote_try_unload(remote); + return ret; + } + + remote->tracing_on = true; + + return 0; +} + +static int trace_remote_disable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (!remote->tracing_on) + return 0; + + ret = remote->cbs->enable_tracing(false, 
remote->priv); + if (ret) + return ret; + + ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS); + remote->tracing_on = false; + trace_remote_try_unload(remote); + + return 0; +} + +static void trace_remote_reset(struct trace_remote *remote, int cpu) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) + ring_buffer_reset(remote->trace_buffer); + else + ring_buffer_reset_cpu(remote->trace_buffer, cpu); + + trace_remote_try_unload(remote); +} + +static ssize_t +tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote); + if (ret) + return ret; + + return cnt; +} +static int tracing_on_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%d\n", remote->tracing_on); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); + +static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* KiB to Bytes */ + if (!val || check_shl_overflow(val, 10, &val)) + return -EINVAL; + + guard(mutex)(&remote->lock); + + if (trace_remote_loaded(remote)) + return -EBUSY; + + remote->trace_buffer_size = val; + + return cnt; +} + +static int buffer_size_kb_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10, + trace_remote_loaded(remote) ? 
"loaded" : "unloaded"); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb); + +static int trace_remote_get(struct trace_remote *remote, int cpu) +{ + int ret; + + if (remote->nr_readers == UINT_MAX) + return -EBUSY; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + if (cpu != RING_BUFFER_ALL_CPUS && !remote->pcpu_reader_locks) { + int lock_cpu; + + remote->pcpu_reader_locks = kcalloc(nr_cpu_ids, sizeof(*remote->pcpu_reader_locks), + GFP_KERNEL); + if (!remote->pcpu_reader_locks) { + trace_remote_try_unload(remote); + return -ENOMEM; + } + + for_each_possible_cpu(lock_cpu) + init_rwsem(&remote->pcpu_reader_locks[lock_cpu]); + } + + remote->nr_readers++; + + return 0; +} + +static void trace_remote_put(struct trace_remote *remote) +{ + if (WARN_ON(!remote->nr_readers)) + return; + + remote->nr_readers--; + if (remote->nr_readers) + return; + + kfree(remote->pcpu_reader_locks); + remote->pcpu_reader_locks = NULL; + + trace_remote_try_unload(remote); +} + +static bool trace_remote_has_cpu(struct trace_remote *remote, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) + return true; + + return ring_buffer_poll_remote(remote->trace_buffer, cpu) == 0; +} + +static void __poll_remote(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct trace_remote_iterator *iter; + + iter = container_of(dwork, struct trace_remote_iterator, poll_work); + ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu); + schedule_delayed_work((struct delayed_work *)work, + msecs_to_jiffies(iter->remote->poll_ms)); +} + +static void __free_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu) +{ + if (cpu != RING_BUFFER_ALL_CPUS) { + ring_buffer_read_finish(iter->rb_iter); + return; + } + + for_each_possible_cpu(cpu) { + if (iter->rb_iters[cpu]) + ring_buffer_read_finish(iter->rb_iters[cpu]); + } + + kfree(iter->rb_iters); +} + +static int __alloc_ring_buffer_iter(struct trace_remote_iterator *iter, int cpu) +{ + if (cpu != 
RING_BUFFER_ALL_CPUS) { + iter->rb_iter = ring_buffer_read_start(iter->remote->trace_buffer, cpu, GFP_KERNEL); + + return iter->rb_iter ? 0 : -ENOMEM; + } + + iter->rb_iters = kcalloc(nr_cpu_ids, sizeof(*iter->rb_iters), GFP_KERNEL); + if (!iter->rb_iters) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + iter->rb_iters[cpu] = ring_buffer_read_start(iter->remote->trace_buffer, cpu, + GFP_KERNEL); + if (!iter->rb_iters[cpu]) { + /* This CPU isn't part of trace_buffer. Skip it */ + if (!trace_remote_has_cpu(iter->remote, cpu)) + continue; + + __free_ring_buffer_iter(iter, RING_BUFFER_ALL_CPUS); + return -ENOMEM; + } + } + + return 0; +} + +static struct trace_remote_iterator +*trace_remote_iter(struct trace_remote *remote, int cpu, enum tri_type type) +{ + struct trace_remote_iterator *iter = NULL; + int ret; + + lockdep_assert_held(&remote->lock); + + if (type == TRI_NONCONSUMING && !trace_remote_loaded(remote)) + return NULL; + + ret = trace_remote_get(remote, cpu); + if (ret) + return ERR_PTR(ret); + + if (!trace_remote_has_cpu(remote, cpu)) { + ret = -ENODEV; + goto err; + } + + iter = kzalloc_obj(*iter); + if (iter) { + iter->remote = remote; + iter->cpu = cpu; + iter->type = type; + trace_seq_init(&iter->seq); + + switch (type) { + case TRI_CONSUMING: + ring_buffer_poll_remote(remote->trace_buffer, cpu); + INIT_DELAYED_WORK(&iter->poll_work, __poll_remote); + schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms)); + break; + case TRI_NONCONSUMING: + ret = __alloc_ring_buffer_iter(iter, cpu); + break; + } + + if (ret) + goto err; + + return iter; + } + ret = -ENOMEM; + +err: + kfree(iter); + trace_remote_put(remote); + + return ERR_PTR(ret); +} + +static void trace_remote_iter_free(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote; + + if (!iter) + return; + + remote = iter->remote; + + lockdep_assert_held(&remote->lock); + + switch (iter->type) { + case TRI_CONSUMING: + cancel_delayed_work_sync(&iter->poll_work); + 
break; + case TRI_NONCONSUMING: + __free_ring_buffer_iter(iter, iter->cpu); + break; + } + + kfree(iter); + trace_remote_put(remote); +} + +static void trace_remote_iter_read_start(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote = iter->remote; + int cpu = iter->cpu; + + /* Acquire global reader lock */ + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING) + down_write(&remote->reader_lock); + else + down_read(&remote->reader_lock); + + if (cpu == RING_BUFFER_ALL_CPUS) + return; + + /* + * No need for the remote lock here, iter holds a reference on + * remote->nr_readers + */ + + /* Get the per-CPU one */ + if (WARN_ON_ONCE(!remote->pcpu_reader_locks)) + return; + + if (iter->type == TRI_CONSUMING) + down_write(&remote->pcpu_reader_locks[cpu]); + else + down_read(&remote->pcpu_reader_locks[cpu]); +} + +static void trace_remote_iter_read_finished(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote = iter->remote; + int cpu = iter->cpu; + + /* Release per-CPU reader lock */ + if (cpu != RING_BUFFER_ALL_CPUS) { + /* + * No need for the remote lock here, iter holds a reference on + * remote->nr_readers + */ + if (iter->type == TRI_CONSUMING) + up_write(&remote->pcpu_reader_locks[cpu]); + else + up_read(&remote->pcpu_reader_locks[cpu]); + } + + /* Release global reader lock */ + if (cpu == RING_BUFFER_ALL_CPUS && iter->type == TRI_CONSUMING) + up_write(&remote->reader_lock); + else + up_read(&remote->reader_lock); +} + +static struct ring_buffer_iter *__get_rb_iter(struct trace_remote_iterator *iter, int cpu) +{ + return iter->cpu != RING_BUFFER_ALL_CPUS ? 
iter->rb_iter : iter->rb_iters[cpu]; +} + +static struct ring_buffer_event * +__peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long *lost_events) +{ + struct ring_buffer_event *rb_evt; + struct ring_buffer_iter *rb_iter; + + switch (iter->type) { + case TRI_CONSUMING: + return ring_buffer_peek(iter->remote->trace_buffer, cpu, ts, lost_events); + case TRI_NONCONSUMING: + rb_iter = __get_rb_iter(iter, cpu); + if (!rb_iter) + return NULL; + + rb_evt = ring_buffer_iter_peek(rb_iter, ts); + if (!rb_evt) + return NULL; + + *lost_events = ring_buffer_iter_dropped(rb_iter); + + return rb_evt; + } + + return NULL; +} + +static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + struct ring_buffer_event *rb_evt; + int cpu = iter->cpu; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + return false; + + rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events); + if (!rb_evt) + return false; + + iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); + return true; + } + + iter->ts = U64_MAX; + for_each_possible_cpu(cpu) { + unsigned long lost_events; + u64 ts; + + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + continue; + + rb_evt = __peek_event(iter, cpu, &ts, &lost_events); + if (!rb_evt) + continue; + + if (ts >= iter->ts) + continue; + + iter->ts = ts; + iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); + iter->lost_events = lost_events; + } + + return iter->ts != U64_MAX; +} + +static void trace_remote_iter_move(struct trace_remote_iterator *iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + + switch (iter->type) { + case TRI_CONSUMING: + ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL); + break; + case TRI_NONCONSUMING: + ring_buffer_iter_advance(__get_rb_iter(iter, iter->evt_cpu)); + break; + } +} + +static struct remote_event 
*trace_remote_find_event(struct trace_remote *remote, unsigned short id); + +static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) +{ + struct remote_event *evt; + unsigned long usecs_rem; + u64 ts = iter->ts; + + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->evt_cpu, iter->lost_events); + + do_div(ts, 1000); + usecs_rem = do_div(ts, USEC_PER_SEC); + + trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu, + ts, usecs_rem); + + evt = trace_remote_find_event(iter->remote, iter->evt->id); + if (!evt) + trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id); + else + evt->print(iter->evt, &iter->seq); + + return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0; +} + +static int trace_pipe_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter; + int cpu = tracing_get_cpu(inode); + + guard(mutex)(&remote->lock); + + iter = trace_remote_iter(remote, cpu, TRI_CONSUMING); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + filp->private_data = iter; + + return IS_ERR(iter) ? 
PTR_ERR(iter) : 0; +} + +static int trace_pipe_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_remote *remote = iter->remote; + + guard(mutex)(&remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + int ret; + +copy_to_user: + ret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (ret != -EBUSY) + return ret; + + trace_seq_init(&iter->seq); + + ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL); + if (ret < 0) + return ret; + + trace_remote_iter_read_start(iter); + + while (trace_remote_iter_read_event(iter)) { + int prev_len = iter->seq.seq.len; + + if (trace_remote_iter_print_event(iter)) { + iter->seq.seq.len = prev_len; + break; + } + + trace_remote_iter_move(iter); + } + + trace_remote_iter_read_finished(iter); + + goto copy_to_user; +} + +static const struct file_operations trace_pipe_fops = { + .open = trace_pipe_open, + .read = trace_pipe_read, + .release = trace_pipe_release, +}; + +static void *trace_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_remote_iterator *iter = m->private; + + ++*pos; + + if (!iter || !trace_remote_iter_read_event(iter)) + return NULL; + + trace_remote_iter_move(iter); + iter->pos++; + + return iter; +} + +static void *trace_start(struct seq_file *m, loff_t *pos) +{ + struct trace_remote_iterator *iter = m->private; + loff_t i; + + if (!iter) + return NULL; + + trace_remote_iter_read_start(iter); + + if (!*pos) { + iter->pos = -1; + return trace_next(m, NULL, &i); + } + + i = iter->pos; + while (i < *pos) { + iter = trace_next(m, NULL, &i); + if (!iter) + return NULL; + } + + return iter; +} + +static int trace_show(struct seq_file *m, void *v) +{ + struct trace_remote_iterator *iter = v; + 
+ trace_seq_init(&iter->seq); + + if (trace_remote_iter_print_event(iter)) { + seq_printf(m, "[EVENT %d PRINT TOO BIG]\n", iter->evt->id); + return 0; + } + + return trace_print_seq(m, &iter->seq); +} + +static void trace_stop(struct seq_file *m, void *v) +{ + struct trace_remote_iterator *iter = m->private; + + if (iter) + trace_remote_iter_read_finished(iter); +} + +static const struct seq_operations trace_sops = { + .start = trace_start, + .next = trace_next, + .show = trace_show, + .stop = trace_stop, +}; + +static int trace_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter = NULL; + int cpu = tracing_get_cpu(inode); + int ret; + + if (!(filp->f_mode & FMODE_READ)) + return 0; + + guard(mutex)(&remote->lock); + + iter = trace_remote_iter(remote, cpu, TRI_NONCONSUMING); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + ret = seq_open(filp, &trace_sops); + if (ret) { + trace_remote_iter_free(iter); + return ret; + } + + ((struct seq_file *)filp->private_data)->private = (void *)iter; + + return 0; +} + +static int trace_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter; + + if (!(filp->f_mode & FMODE_READ)) + return 0; + + iter = ((struct seq_file *)filp->private_data)->private; + seq_release(inode, filp); + + if (!iter) + return 0; + + guard(mutex)(&iter->remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_remote *remote = inode->i_private; + int cpu = tracing_get_cpu(inode); + + guard(mutex)(&remote->lock); + + trace_remote_reset(remote, cpu); + + return cnt; +} + +static const struct file_operations trace_fops = { + .open = trace_open, + .write = trace_write, + .read = seq_read, + .read_iter = seq_read_iter, + .release = trace_release, +}; + +static int 
trace_remote_init_tracefs(const char *name, struct trace_remote *remote) +{ + struct dentry *remote_d, *percpu_d, *d; + static struct dentry *root; + static DEFINE_MUTEX(lock); + bool root_inited = false; + int cpu; + + guard(mutex)(&lock); + + if (!root) { + root = tracefs_create_dir(TRACEFS_DIR, NULL); + if (!root) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n"); + return -ENOMEM; + } + root_inited = true; + } + + remote_d = tracefs_create_dir(name, root); + if (!remote_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name); + goto err; + } + + d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops); + if (!d) + goto err; + + d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote, + &buffer_size_kb_fops); + if (!d) + goto err; + + d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops); + if (!d) + goto err; + + d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops); + if (!d) + goto err; + + percpu_d = tracefs_create_dir("per_cpu", remote_d); + if (!percpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name); + goto err; + } + + for_each_possible_cpu(cpu) { + struct dentry *cpu_d; + char cpu_name[16]; + + snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu); + cpu_d = tracefs_create_dir(cpu_name, percpu_d); + if (!cpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/percpu/cpu%d\n", + name, cpu); + goto err; + } + + d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu, + &trace_pipe_fops); + if (!d) + goto err; + + d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu, + &trace_fops); + if (!d) + goto err; + } + + remote->dentry = remote_d; + + return 0; + +err: + if (root_inited) { + tracefs_remove(root); + root = NULL; + } else { + tracefs_remove(remote_d); + } + + return -ENOMEM; +} + +static int trace_remote_register_events(const char 
*remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events); + +/** + * trace_remote_register() - Register a Tracefs remote + * @name: Name of the remote, used for the Tracefs remotes/ directory. + * @cbs: Set of callbacks used to control the remote. + * @priv: Private data, passed to each callback from @cbs. + * @events: Array of events. &remote_event.name and &remote_event.id must be + * filled by the caller. + * @nr_events: Number of events in the @events array. + * + * A trace remote is an entity, outside of the kernel (most likely firmware or + * hypervisor) capable of writing events into a Tracefs compatible ring-buffer. + * The kernel would then act as a reader. + * + * The registered remote will be found under the Tracefs directory + * remotes/<name>. + * + * Return: 0 on success, negative error code on failure. + */ +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events) +{ + struct trace_remote *remote; + int ret; + + remote = kzalloc_obj(*remote); + if (!remote) + return -ENOMEM; + + remote->cbs = cbs; + remote->priv = priv; + remote->trace_buffer_size = 7 << 10; + remote->poll_ms = 100; + mutex_init(&remote->lock); + init_rwsem(&remote->reader_lock); + + if (trace_remote_init_tracefs(name, remote)) { + kfree(remote); + return -ENOMEM; + } + + ret = trace_remote_register_events(name, remote, events, nr_events); + if (ret) { + pr_err("Failed to register events for trace remote '%s' (%d)\n", + name, ret); + return ret; + } + + ret = cbs->init ? 
cbs->init(remote->dentry, priv) : 0; + if (ret) + pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_register); + +/** + * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer() + * @desc: Descriptor of the per-CPU ring-buffers, originally filled by + * trace_remote_alloc_buffer() + * + * Most likely called from &trace_remote_callbacks.unload_trace_buffer. + */ +void trace_remote_free_buffer(struct trace_buffer_desc *desc) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + unsigned int id; + + free_page(rb_desc->meta_va); + + for (id = 0; id < rb_desc->nr_page_va; id++) + free_page(rb_desc->page_va[id]); + } +} +EXPORT_SYMBOL_GPL(trace_remote_free_buffer); + +/** + * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer + * @desc: Uninitialized trace_buffer_desc + * @desc_size: Size of the trace_buffer_desc. Must be at least equal to + * trace_buffer_desc_size() + * @buffer_size: Size in bytes of each per-CPU ring-buffer + * @cpumask: CPUs to allocate a ring-buffer for + * + * Helper to dynamically allocate a set of pages (enough to cover @buffer_size) + * for each CPU from @cpumask and fill @desc. Most likely called from + * &trace_remote_callbacks.load_trace_buffer. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + void *desc_end = desc + desc_size; + struct ring_buffer_desc *rb_desc; + int cpu, ret = -ENOMEM; + + if (desc_size < struct_size(desc, __data, 0)) + return -EINVAL; + + desc->nr_cpus = 0; + desc->struct_len = struct_size(desc, __data, 0); + + rb_desc = (struct ring_buffer_desc *)&desc->__data[0]; + + for_each_cpu(cpu, cpumask) { + unsigned int id; + + if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) { + ret = -EINVAL; + goto err; + } + + rb_desc->cpu = cpu; + rb_desc->nr_page_va = 0; + rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->meta_va) + goto err; + + for (id = 0; id < nr_pages; id++) { + rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->page_va[id]) + goto err; + + rb_desc->nr_page_va++; + } + desc->nr_cpus++; + desc->struct_len += offsetof(struct ring_buffer_desc, page_va); + desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va); + rb_desc = __next_ring_buffer_desc(rb_desc); + } + + return 0; + +err: + trace_remote_free_buffer(desc); + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); + +static int +trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (evt->enabled == enable) + return 0; + + ret = remote->cbs->enable_event(evt->id, enable, remote->priv); + if (ret) + return ret; + + evt->enabled = enable; + + return 0; +} + +static int remote_event_enable_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->enabled); + + return 0; +} + +static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct 
seq_file *seq = filp->private_data; + struct remote_event *evt = seq->private; + struct trace_remote *remote = evt->remote; + u8 enable; + int ret; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = trace_remote_enable_event(remote, evt, enable); + if (ret) + return ret; + + return count; +} +DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable); + +static int remote_event_id_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_id); + +static int remote_event_format_show(struct seq_file *s, void *unused) +{ + size_t offset = sizeof(struct remote_event_hdr); + struct remote_event *evt = s->private; + struct trace_event_fields *field; + + seq_printf(s, "name: %s\n", evt->name); + seq_printf(s, "ID: %d\n", evt->id); + seq_puts(s, + "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n"); + + field = &evt->fields[0]; + while (field->name) { + seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, offset, field->size, + field->is_signed); + offset += field->size; + field++; + } + + if (field != &evt->fields[0]) + seq_puts(s, "\n"); + + seq_printf(s, "print fmt: %s\n", evt->print_fmt); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_format); + +static int remote_event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_event_enable_fops; + return 1; + } + + if (!strcmp(name, "id")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_id_fops; + return 1; + } + + if (!strcmp(name, "format")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_format_fops; + return 1; + } + + return 0; +} + +static ssize_t remote_events_dir_enable_write(struct file *filp, const char __user *ubuf, + size_t 
count, loff_t *ppos) +{ + struct trace_remote *remote = file_inode(filp)->i_private; + int i, ret; + u8 enable; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + for (i = 0; i < remote->nr_events; i++) { + struct remote_event *evt = &remote->events[i]; + + trace_remote_enable_event(remote, evt, enable); + } + + return count; +} + +static ssize_t remote_events_dir_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_remote *remote = file_inode(filp)->i_private; + const char enabled_char[] = {'0', '1', 'X'}; + char enabled_str[] = " \n"; + int i, enabled = -1; + + guard(mutex)(&remote->lock); + + for (i = 0; i < remote->nr_events; i++) { + struct remote_event *evt = &remote->events[i]; + + if (enabled == -1) { + enabled = evt->enabled; + } else if (enabled != evt->enabled) { + enabled = 2; + break; + } + } + + enabled_str[0] = enabled_char[enabled == -1 ? 0 : enabled]; + + return simple_read_from_buffer(ubuf, cnt, ppos, enabled_str, 2); +} + +static const struct file_operations remote_events_dir_enable_fops = { + .write = remote_events_dir_enable_write, + .read = remote_events_dir_enable_read, +}; + +static ssize_t +remote_events_dir_header_page_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + ring_buffer_print_page_header(NULL, s); + ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); + kfree(s); + + return ret; +} + +static const struct file_operations remote_events_dir_header_page_fops = { + .read = remote_events_dir_header_page_read, +}; + +static ssize_t +remote_events_dir_header_event_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_seq *s; + int ret; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + 
trace_seq_init(s); + + ring_buffer_print_entry_header(s); + ret = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, trace_seq_used(s)); + kfree(s); + + return ret; +} + +static const struct file_operations remote_events_dir_header_event_fops = { + .read = remote_events_dir_header_event_read, +}; + +static int remote_events_dir_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_events_dir_enable_fops; + return 1; + } + + if (!strcmp(name, "header_page")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_events_dir_header_page_fops; + return 1; + } + + if (!strcmp(name, "header_event")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_events_dir_header_event_fops; + return 1; + } + + return 0; +} + +static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote, + struct remote_event *evt) +{ + struct eventfs_inode *eventfs = remote->eventfs; + static struct eventfs_entry dir_entries[] = { + { + .name = "enable", + .callback = remote_events_dir_callback, + }, { + .name = "header_page", + .callback = remote_events_dir_callback, + }, { + .name = "header_event", + .callback = remote_events_dir_callback, + } + }; + static struct eventfs_entry entries[] = { + { + .name = "enable", + .callback = remote_event_callback, + }, { + .name = "id", + .callback = remote_event_callback, + }, { + .name = "format", + .callback = remote_event_callback, + } + }; + bool eventfs_create = false; + + if (!eventfs) { + eventfs = eventfs_create_events_dir("events", remote->dentry, dir_entries, + ARRAY_SIZE(dir_entries), remote); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + /* + * Create similar hierarchy as local events even if a single system is supported at + * the moment + */ + eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + remote->eventfs = eventfs; + 
eventfs_create = true; + } + + eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt); + if (IS_ERR(eventfs)) { + if (eventfs_create) { + eventfs_remove_events_dir(remote->eventfs); + remote->eventfs = NULL; + } + return PTR_ERR(eventfs); + } + + return 0; +} + +static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events, + size_t nr_events) +{ + int i; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + if (evt->remote) + return -EEXIST; + + evt->remote = remote; + + /* We need events to be sorted for efficient lookup */ + if (i && evt->id <= events[i - 1].id) + return -EINVAL; + } + + remote->events = events; + remote->nr_events = nr_events; + + return 0; +} + +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events) +{ + int i, ret; + + ret = trace_remote_attach_events(remote, events, nr_events); + if (ret) + return ret; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + ret = trace_remote_init_eventfs(remote_name, remote, evt); + if (ret) + pr_warn("Failed to init eventfs for event '%s' (%d)", + evt->name, ret); + } + + return 0; +} + +static int __cmp_events(const void *key, const void *data) +{ + const struct remote_event *evt = data; + int id = (int)((long)key); + + return id - (int)evt->id; +} + +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id) +{ + return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events, + sizeof(*remote->events), __cmp_events); +} diff --git a/kernel/trace/trace_snapshot.c b/kernel/trace/trace_snapshot.c new file mode 100644 index 000000000000..07b43c9863a2 --- /dev/null +++ b/kernel/trace/trace_snapshot.c @@ -0,0 +1,1066 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/fsnotify.h> + +#include <asm/setup.h> /* COMMAND_LINE_SIZE */ + +#include 
"trace.h" + +/* Used if snapshot allocated at boot */ +static bool allocate_snapshot; +static bool snapshot_at_boot; + +static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; +static int boot_snapshot_index; + +static int __init boot_alloc_snapshot(char *str) +{ + char *slot = boot_snapshot_info + boot_snapshot_index; + int left = sizeof(boot_snapshot_info) - boot_snapshot_index; + int ret; + + if (str[0] == '=') { + str++; + if (strlen(str) >= left) + return -1; + + ret = snprintf(slot, left, "%s\t", str); + boot_snapshot_index += ret; + } else { + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + trace_set_ring_buffer_expanded(NULL); + } + return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); + + +static int __init boot_snapshot(char *str) +{ + snapshot_at_boot = true; + boot_alloc_snapshot(str); + return 1; +} +__setup("ftrace_boot_snapshot", boot_snapshot); +static void tracing_snapshot_instance_cond(struct trace_array *tr, + void *cond_data) +{ + unsigned long flags; + + if (in_nmi()) { + trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + trace_array_puts(tr, "*** snapshot is being ignored ***\n"); + return; + } + + if (!tr->allocated_snapshot) { + trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n"); + trace_array_puts(tr, "*** stopping trace here! 
***\n"); + tracer_tracing_off(tr); + return; + } + + if (tr->mapped) { + trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer_uses_snapshot(tr->current_trace)) { + trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n"); + trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id(), cond_data); + local_irq_restore(flags); +} + +void tracing_snapshot_instance(struct trace_array *tr) +{ + tracing_snapshot_instance_cond(tr, NULL); +} + +/** + * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. + * @tr: The tracing instance to snapshot + * @cond_data: The data to be tested conditionally, and possibly saved + * + * This is the same as tracing_snapshot() except that the snapshot is + * conditional - the snapshot will only happen if the + * cond_snapshot.update() implementation receiving the cond_data + * returns true, which means that the trace array's cond_snapshot + * update() operation used the cond_data to determine whether the + * snapshot should be taken, and if it was, presumably saved it along + * with the snapshot. + */ +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + tracing_snapshot_instance_cond(tr, cond_data); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); + +/** + * tracing_cond_snapshot_data - get the user data associated with a snapshot + * @tr: The tracing instance + * + * When the user enables a conditional snapshot using + * tracing_snapshot_cond_enable(), the user-defined cond_data is saved + * with the snapshot. This accessor is used to retrieve it. + * + * Should not be called from cond_snapshot.update(), since it takes + * the tr->max_lock lock, which the code calling + * cond_snapshot.update() has already done. 
+ * + * Returns the cond_data associated with the trace array's snapshot. + */ +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + void *cond_data = NULL; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (tr->cond_snapshot) + cond_data = tr->cond_snapshot->cond_data; + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return cond_data; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); + +/* resize @tr's buffer to the size of @size_tr's entries */ +int resize_buffer_duplicate_size(struct array_buffer *trace_buf, + struct array_buffer *size_buf, int cpu_id) +{ + int cpu, ret = 0; + + if (cpu_id == RING_BUFFER_ALL_CPUS) { + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); + if (ret < 0) + break; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; + } + } else { + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); + if (ret == 0) + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; + } + + return ret; +} + +int tracing_alloc_snapshot_instance(struct trace_array *tr) +{ + int order; + int ret; + + if (!tr->allocated_snapshot) { + + /* Make the snapshot buffer have the same order as main buffer */ + order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer); + ret = ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, order); + if (ret < 0) + return ret; + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, + &tr->array_buffer, RING_BUFFER_ALL_CPUS); + if (ret < 0) + return ret; + + tr->allocated_snapshot = true; + } + + return 0; +} + +void free_snapshot(struct trace_array *tr) +{ + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. 
+ */ + ring_buffer_subbuf_order_set(tr->snapshot_buffer.buffer, 0); + ring_buffer_resize(tr->snapshot_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + trace_set_buffer_entries(&tr->snapshot_buffer, 1); + tracing_reset_online_cpus(&tr->snapshot_buffer); + tr->allocated_snapshot = false; +} + +int tracing_arm_snapshot_locked(struct trace_array *tr) +{ + int ret; + + lockdep_assert_held(&trace_types_lock); + + spin_lock(&tr->snapshot_trigger_lock); + if (tr->snapshot == UINT_MAX || tr->mapped) { + spin_unlock(&tr->snapshot_trigger_lock); + return -EBUSY; + } + + tr->snapshot++; + spin_unlock(&tr->snapshot_trigger_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) { + spin_lock(&tr->snapshot_trigger_lock); + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); + } + + return ret; +} + +int tracing_arm_snapshot(struct trace_array *tr) +{ + guard(mutex)(&trace_types_lock); + return tracing_arm_snapshot_locked(tr); +} + +void tracing_disarm_snapshot(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->snapshot)) + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); +} + +/** + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to tracing_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. 
+ */ +void tracing_snapshot_alloc(void) +{ + int ret; + + ret = tracing_alloc_snapshot(); + if (ret < 0) + return; + + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); + +/** + * tracing_snapshot_cond_enable - enable conditional snapshot for an instance + * @tr: The tracing instance + * @cond_data: User data to associate with the snapshot + * @update: Implementation of the cond_snapshot update function + * + * Check whether the conditional snapshot for the given instance has + * already been enabled, or if the current tracer is already using a + * snapshot; if so, return -EBUSY, else create a cond_snapshot and + * save the cond_data and update function inside. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, + cond_update_fn_t update) +{ + struct cond_snapshot *cond_snapshot __free(kfree) = + kzalloc_obj(*cond_snapshot); + int ret; + + if (!cond_snapshot) + return -ENOMEM; + + cond_snapshot->cond_data = cond_data; + cond_snapshot->update = update; + + guard(mutex)(&trace_types_lock); + + if (tracer_uses_snapshot(tr->current_trace)) + return -EBUSY; + + /* + * The cond_snapshot can only change to NULL without the + * trace_types_lock. We don't care if we race with it going + * to NULL, but we want to make sure that it's not set to + * something other than NULL when we get here, which we can + * do safely with only holding the trace_types_lock and not + * having to take the max_lock. 
+ */ + if (tr->cond_snapshot) + return -EBUSY; + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + tr->cond_snapshot = no_free_ptr(cond_snapshot); + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + return 0; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); + +/** + * tracing_snapshot_cond_disable - disable conditional snapshot for an instance + * @tr: The tracing instance + * + * Check whether the conditional snapshot for the given instance is + * enabled; if so, free the cond_snapshot associated with it, + * otherwise return -EINVAL. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + int ret = 0; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + + if (!tr->cond_snapshot) + ret = -EINVAL; + else { + kfree(tr->cond_snapshot); + tr->cond_snapshot = NULL; + } + + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + + tracing_disarm_snapshot(tr); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); + +#ifdef CONFIG_TRACER_MAX_TRACE +#ifdef LATENCY_FS_NOTIFY +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * 
We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. + */ + irq_work_queue(&tr->fsnotify_irqwork); +} +#endif /* LATENCY_FS_NOTIFY */ + +static const struct file_operations tracing_max_lat_fops; + +void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ +#ifdef LATENCY_FS_NOTIFY + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); +#endif + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, + d_tracer, tr, + &tracing_max_lat_fops); +} + +/* + * Copy the new maximum trace into the separate maximum-trace + * structure. (this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct array_buffer *trace_buf = &tr->array_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct array_buffer *max_buf = &tr->snapshot_buffer; + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); + + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; + + max_data->saved_latency = tr->max_latency; + max_data->critical_start = data->critical_start; + max_data->critical_end = data->critical_end; + + strscpy(max_data->comm, tsk->comm); + max_data->pid = tsk->pid; + /* + * If tsk == current, then use current_uid(), as that does not use + * RCU. The irq tracer can be called out of RCU scope. 
+ */ + if (tsk == current) + max_data->uid = current_uid(); + else + max_data->uid = task_uid(tsk); + + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + max_data->policy = tsk->policy; + max_data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(tsk); + latency_fsnotify(tr); +} +#else +static inline void __update_max_tr(struct trace_array *tr, + struct task_struct *tsk, int cpu) { } +#endif /* CONFIG_TRACER_MAX_TRACE */ + +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * @cond_data: User data associated with a conditional snapshot + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ +void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data) +{ + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + /* Inherit the recordable setting from array_buffer */ + if (ring_buffer_record_is_set_on(tr->array_buffer.buffer)) + ring_buffer_record_on(tr->snapshot_buffer.buffer); + else + ring_buffer_record_off(tr->snapshot_buffer.buffer); + + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) { + arch_spin_unlock(&tr->max_lock); + return; + } + + swap(tr->array_buffer.buffer, tr->snapshot_buffer.buffer); + + __update_max_tr(tr, tsk, cpu); + + arch_spin_unlock(&tr->max_lock); + + /* Any waiters on the old snapshot buffer need to wake up */ + ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr: tracer + * @tsk: task with the latency + * @cpu: the cpu of the 
buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. + */ +void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + int ret; + + if (tr->stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + if (!tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); + return; + } + + arch_spin_lock(&tr->max_lock); + + ret = ring_buffer_swap_cpu(tr->snapshot_buffer.buffer, tr->array_buffer.buffer, cpu); + + if (ret == -EBUSY) { + /* + * We failed to swap the buffer due to a commit taking + * place on this CPU. We fail to record, but we reset + * the max trace buffer (no one writes directly to it) + * and flag that it failed. + * Another reason is resize is in progress. + */ + trace_array_printk_buf(tr->snapshot_buffer.buffer, _THIS_IP_, + "Failed to swap buffers due to commit or resize in progress\n"); + } + + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); + + __update_max_tr(tr, tsk, cpu); + arch_spin_unlock(&tr->max_lock); +} + +static void show_snapshot_main_help(struct seq_file *m) +{ + seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" + "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer.\n" + "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ + seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer for this cpu.\n"); +#else + seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" + "# Must use main snapshot file to allocate.\n"); +#endif + 
seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); +} + +void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->tr->allocated_snapshot) + seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_puts(m, "# Snapshot commands:\n"); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + show_snapshot_main_help(m); + else + show_snapshot_percpu_help(m); +} + +static int tracing_snapshot_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + struct trace_iterator *iter; + struct seq_file *m; + int ret; + + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file, true); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + } else { + /* Writes still need the seq_file to hold the private data */ + ret = -ENOMEM; + m = kzalloc_obj(*m); + if (!m) + goto out; + iter = kzalloc_obj(*iter); + if (!iter) { + kfree(m); + goto out; + } + ret = 0; + + iter->tr = tr; + iter->array_buffer = &tr->snapshot_buffer; + iter->cpu_file = tracing_get_cpu(inode); + m->private = iter; + file->private_data = m; + } +out: + if (ret < 0) + trace_array_put(tr); + + return ret; +} + +static void tracing_swap_cpu_buffer(void *tr) +{ + update_max_tr_single((struct trace_array *)tr, current, smp_processor_id()); +} + +static ssize_t +tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct seq_file *m = filp->private_data; + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; + unsigned long val; + int ret; + + ret = tracing_update_buffers(tr); + if (ret < 0) + return ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&trace_types_lock); + + if 
(tracer_uses_snapshot(tr->current_trace)) + return -EBUSY; + + local_irq_disable(); + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + local_irq_enable(); + if (ret) + return ret; + + switch (val) { + case 0: + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; + if (tr->allocated_snapshot) + free_snapshot(tr); + break; + case 1: +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; +#endif + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->snapshot_buffer, + &tr->array_buffer, iter->cpu_file); + + ret = tracing_arm_snapshot_locked(tr); + if (ret) + return ret; + + /* Now, we're going to swap */ + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { + local_irq_disable(); + update_max_tr(tr, current, smp_processor_id(), NULL); + local_irq_enable(); + } else { + smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, + (void *)tr, 1); + } + tracing_disarm_snapshot(tr); + break; + default: + if (tr->allocated_snapshot) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->snapshot_buffer); + else + tracing_reset_cpu(&tr->snapshot_buffer, iter->cpu_file); + } + break; + } + + if (ret >= 0) { + *ppos += cnt; + ret = cnt; + } + + return ret; +} + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + int ret; + + ret = tracing_release(inode, file); + + if (file->f_mode & FMODE_READ) + return ret; + + /* If write only, the seq_file is just a stub */ + if (m) + kfree(m->private); + kfree(m); + + return 0; +} + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ + struct ftrace_buffer_info *info; + int ret; + + /* The following checks for tracefs lockdown */ + ret = tracing_buffers_open(inode, filp); + if (ret < 0) + return ret; + + info = 
filp->private_data; + + if (tracer_uses_snapshot(info->iter.trace)) { + tracing_buffers_release(inode, filp); + return -EBUSY; + } + + info->iter.snapshot = true; + info->iter.array_buffer = &info->iter.tr->snapshot_buffer; + + return ret; +} + +const struct file_operations snapshot_fops = { + .open = tracing_snapshot_open, + .read = seq_read, + .write = tracing_snapshot_write, + .llseek = tracing_lseek, + .release = tracing_snapshot_release, +}; + +const struct file_operations snapshot_raw_fops = { + .open = snapshot_raw_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, +}; + +#ifdef CONFIG_TRACER_MAX_TRACE +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_read(&tr->max_latency, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + + return tracing_nsecs_write(&tr->max_latency, ubuf, cnt, ppos); +} + +static const struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic_tr, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; +#endif /* CONFIG_TRACER_MAX_TRACE */ + +int get_snapshot_map(struct trace_array *tr) +{ + int err = 0; + + /* + * Called with mmap_lock held. lockdep would be unhappy if we would now + * take trace_types_lock. Instead use the specific + * snapshot_trigger_lock. 
+ */ + spin_lock(&tr->snapshot_trigger_lock); + + if (tr->snapshot || tr->mapped == UINT_MAX) + err = -EBUSY; + else + tr->mapped++; + + spin_unlock(&tr->snapshot_trigger_lock); + + /* Wait for update_max_tr() to observe iter->tr->mapped */ + if (tr->mapped == 1) + synchronize_rcu(); + + return err; + +} + +void put_snapshot_map(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->mapped)) + tr->mapped--; + spin_unlock(&tr->snapshot_trigger_lock); +} + +#ifdef CONFIG_DYNAMIC_FTRACE +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + tracing_snapshot_instance(tr); +} + +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + + if (*count <= 0) + return; + + (*count)--; + } + + tracing_snapshot_instance(tr); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + struct ftrace_func_mapper *mapper = data; + long *count = NULL; + + seq_printf(m, "%ps:", (void *)ip); + + seq_puts(m, "snapshot"); + + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); + else + seq_puts(m, ":unlimited\n"); + + return 0; +} + +static int +ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array 
*tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); +} + +static struct ftrace_probe_ops snapshot_probe_ops = { + .func = ftrace_snapshot, + .print = ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { + .func = ftrace_count_snapshot, + .print = ftrace_snapshot_print, + .init = ftrace_snapshot_init, + .free = ftrace_snapshot_free, +}; + +static int +ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + if (!tr) + return -ENODEV; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; + + if (glob[0] == '!') { + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (!ret) + tracing_disarm_snapshot(tr); + + return ret; + } + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = tracing_arm_snapshot(tr); + if (ret < 0) + return ret; + + ret = register_ftrace_function_probe(glob, tr, ops, count); + if (ret < 0) + tracing_disarm_snapshot(tr); + + return ret < 0 ? 
ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { + .name = "snapshot", + .func = ftrace_trace_snapshot_callback, +}; + +__init int register_snapshot_cmd(void) +{ + return register_ftrace_command(&ftrace_snapshot_cmd); +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +int trace_allocate_snapshot(struct trace_array *tr, int size) +{ + int ret; + + /* Fix mapped buffer trace arrays do not have snapshot buffers */ + if (tr->range_addr_start) + return 0; + + /* allocate_snapshot can only be true during system boot */ + ret = allocate_trace_buffer(tr, &tr->snapshot_buffer, + allocate_snapshot ? size : 1); + if (ret < 0) + return -ENOMEM; + + tr->allocated_snapshot = allocate_snapshot; + + allocate_snapshot = false; + return 0; +} + +__init static bool tr_needs_alloc_snapshot(const char *name) +{ + char *test; + int len = strlen(name); + bool ret; + + if (!boot_snapshot_index) + return false; + + if (strncmp(name, boot_snapshot_info, len) == 0 && + boot_snapshot_info[len] == '\t') + return true; + + test = kmalloc(strlen(name) + 3, GFP_KERNEL); + if (!test) + return false; + + sprintf(test, "\t%s\t", name); + ret = strstr(boot_snapshot_info, test) == NULL; + kfree(test); + return ret; +} + +__init void do_allocate_snapshot(const char *name) +{ + if (!tr_needs_alloc_snapshot(name)) + return; + + /* + * When allocate_snapshot is set, the next call to + * allocate_trace_buffers() (called by trace_array_get_by_name()) + * will allocate the snapshot buffer. That will also clear + * this flag. 
+ */ + allocate_snapshot = true; +} + +void __init ftrace_boot_snapshot(void) +{ + struct trace_array *tr; + + if (!snapshot_at_boot) + return; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->allocated_snapshot) + continue; + + tracing_snapshot_instance(tr); + trace_array_puts(tr, "** Boot snapshot taken **\n"); + } +} diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 37317b81fcda..8ad72e17d8eb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -174,7 +174,6 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat { O_NOFOLLOW, "O_NOFOLLOW" }, { O_NOATIME, "O_NOATIME" }, { O_CLOEXEC, "O_CLOEXEC" }, - { -1, NULL } }; trace_seq_printf(s, "%s(", entry->name); @@ -205,7 +204,7 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat trace_seq_puts(s, "O_RDONLY|"); } - trace_print_flags_seq(s, "|", bits, __flags); + trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags)); /* * trace_print_flags_seq() adds a '\0' to the * buffer, but this needs to append more to the seq. 
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 91905aa19294..dffef52a807b 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -300,6 +300,8 @@ static int tracepoint_add_func(struct tracepoint *tp, lockdep_is_held(&tracepoints_mutex)); old = func_add(&tp_funcs, func, prio); if (IS_ERR(old)) { + if (tp->ext && tp->ext->unregfunc && !static_key_enabled(&tp->key)) + tp->ext->unregfunc(); WARN_ON_ONCE(warn && PTR_ERR(old) != -ENOMEM); return PTR_ERR(old); } diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 8d82913223a1..8614430ca212 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -18,8 +18,6 @@ #include <asm/page.h> #include <asm/sections.h> -#include <crypto/sha1.h> - #include "kallsyms_internal.h" #include "kexec_internal.h" @@ -198,7 +196,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(page, compound_info); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); #ifdef CONFIG_FLATMEM diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7d675781bc91..87dd5e0f6968 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -61,6 +61,13 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; # endif /* CONFIG_SMP */ /* + * Number of consecutive missed interrupts before declaring a lockup. + * Default to 1 (immediate) for NMI/Perf. Buddy will overwrite this to 3. 
+ */ +int __read_mostly watchdog_hardlockup_miss_thresh = 1; +EXPORT_SYMBOL_GPL(watchdog_hardlockup_miss_thresh); + +/* * Should we panic when a soft-lockup or hard-lockup occurs: */ unsigned int __read_mostly hardlockup_panic = @@ -137,6 +144,7 @@ __setup("nmi_watchdog=", hardlockup_panic_setup); static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(int, hrtimer_interrupts_missed); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched); static unsigned long hard_lockup_nmi_warn; @@ -159,21 +167,33 @@ void watchdog_hardlockup_touch_cpu(unsigned int cpu) per_cpu(watchdog_hardlockup_touched, cpu) = true; } -static bool is_hardlockup(unsigned int cpu) +static void watchdog_hardlockup_update_reset(unsigned int cpu) { int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu)); - if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) - return true; - /* * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is * written/read by a single CPU. 
*/ per_cpu(hrtimer_interrupts_saved, cpu) = hrint; + per_cpu(hrtimer_interrupts_missed, cpu) = 0; +} + +static bool is_hardlockup(unsigned int cpu) +{ + int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu)); - return false; + if (per_cpu(hrtimer_interrupts_saved, cpu) != hrint) { + watchdog_hardlockup_update_reset(cpu); + return false; + } + + per_cpu(hrtimer_interrupts_missed, cpu)++; + if (per_cpu(hrtimer_interrupts_missed, cpu) % watchdog_hardlockup_miss_thresh) + return false; + + return true; } static void watchdog_hardlockup_kick(void) @@ -187,8 +207,11 @@ static void watchdog_hardlockup_kick(void) void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { int hardlockup_all_cpu_backtrace; + unsigned int this_cpu; + unsigned long flags; if (per_cpu(watchdog_hardlockup_touched, cpu)) { + watchdog_hardlockup_update_reset(cpu); per_cpu(watchdog_hardlockup_touched, cpu) = false; return; } @@ -201,74 +224,73 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) * fired multiple times before we overflow'd. If it hasn't * then this is a good indication the cpu is stuck */ - if (is_hardlockup(cpu)) { - unsigned int this_cpu = smp_processor_id(); - unsigned long flags; + if (!is_hardlockup(cpu)) { + per_cpu(watchdog_hardlockup_warned, cpu) = false; + return; + } #ifdef CONFIG_SYSFS - ++hardlockup_count; + ++hardlockup_count; #endif - /* - * A poorly behaving BPF scheduler can trigger hard lockup by - * e.g. putting numerous affinitized tasks in a single queue and - * directing all CPUs at it. The following call can return true - * only once when sched_ext is enabled and will immediately - * abort the BPF scheduler and print out a warning message. - */ - if (scx_hardlockup(cpu)) - return; + /* + * A poorly behaving BPF scheduler can trigger hard lockup by + * e.g. putting numerous affinitized tasks in a single queue and + * directing all CPUs at it. 
The following call can return true + * only once when sched_ext is enabled and will immediately + * abort the BPF scheduler and print out a warning message. + */ + if (scx_hardlockup(cpu)) + return; - /* Only print hardlockups once. */ - if (per_cpu(watchdog_hardlockup_warned, cpu)) - return; + /* Only print hardlockups once. */ + if (per_cpu(watchdog_hardlockup_warned, cpu)) + return; - /* - * Prevent multiple hard-lockup reports if one cpu is already - * engaged in dumping all cpu back traces. - */ - if (hardlockup_all_cpu_backtrace) { - if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) - return; - } + /* + * Prevent multiple hard-lockup reports if one cpu is already + * engaged in dumping all cpu back traces. + */ + if (hardlockup_all_cpu_backtrace) { + if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) + return; + } - /* - * NOTE: we call printk_cpu_sync_get_irqsave() after printing - * the lockup message. While it would be nice to serialize - * that printout, we really want to make sure that if some - * other CPU somehow locked up while holding the lock associated - * with printk_cpu_sync_get_irqsave() that we can still at least - * get the message about the lockup out. - */ - pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu); - printk_cpu_sync_get_irqsave(flags); + /* + * NOTE: we call printk_cpu_sync_get_irqsave() after printing + * the lockup message. While it would be nice to serialize + * that printout, we really want to make sure that if some + * other CPU somehow locked up while holding the lock associated + * with printk_cpu_sync_get_irqsave() that we can still at least + * get the message about the lockup out. 
+ */ + this_cpu = smp_processor_id(); + pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu); + printk_cpu_sync_get_irqsave(flags); - print_modules(); - print_irqtrace_events(current); - if (cpu == this_cpu) { - if (regs) - show_regs(regs); - else - dump_stack(); - printk_cpu_sync_put_irqrestore(flags); - } else { - printk_cpu_sync_put_irqrestore(flags); - trigger_single_cpu_backtrace(cpu); - } + print_modules(); + print_irqtrace_events(current); + if (cpu == this_cpu) { + if (regs) + show_regs(regs); + else + dump_stack(); + printk_cpu_sync_put_irqrestore(flags); + } else { + printk_cpu_sync_put_irqrestore(flags); + trigger_single_cpu_backtrace(cpu); + } - if (hardlockup_all_cpu_backtrace) { - trigger_allbutcpu_cpu_backtrace(cpu); - if (!hardlockup_panic) - clear_bit_unlock(0, &hard_lockup_nmi_warn); - } + if (hardlockup_all_cpu_backtrace) { + trigger_allbutcpu_cpu_backtrace(cpu); + if (!hardlockup_panic) + clear_bit_unlock(0, &hard_lockup_nmi_warn); + } - sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); - per_cpu(watchdog_hardlockup_warned, cpu) = true; - } else { - per_cpu(watchdog_hardlockup_warned, cpu) = false; - } + per_cpu(watchdog_hardlockup_warned, cpu) = true; } #else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c index ee754d767c21..3a1e57080c1c 100644 --- a/kernel/watchdog_buddy.c +++ b/kernel/watchdog_buddy.c @@ -21,6 +21,7 @@ static unsigned int watchdog_next_cpu(unsigned int cpu) int __init watchdog_hardlockup_probe(void) { + watchdog_hardlockup_miss_thresh = 3; return 0; } @@ -86,14 +87,6 @@ void watchdog_buddy_check_hardlockup(int hrtimer_interrupts) { unsigned int next_cpu; - /* - * Test for hardlockups every 3 samples. 
The sample period is - * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over - * watchdog_thresh (over by 20%). - */ - if (hrtimer_interrupts % 3 != 0) - return; - /* check for a hardlockup on the next CPU */ next_cpu = watchdog_next_cpu(smp_processor_id()); if (next_cpu >= nr_cpu_ids) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aeaec79bc09c..5f747f241a5f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,7 @@ #include <linux/mempolicy.h> #include <linux/freezer.h> #include <linux/debug_locks.h> +#include <linux/device/devres.h> #include <linux/lockdep.h> #include <linux/idr.h> #include <linux/jhash.h> @@ -130,6 +131,14 @@ enum wq_internal_consts { WORKER_ID_LEN = 10 + WQ_NAME_LEN, /* "kworker/R-" + WQ_NAME_LEN */ }; +/* Layout of shards within one LLC pod */ +struct llc_shard_layout { + int nr_large_shards; /* number of large shards (cores_per_shard + 1) */ + int cores_per_shard; /* base number of cores per default shard */ + int nr_shards; /* total number of shards */ + /* nr_default shards = (nr_shards - nr_large_shards) */ +}; + /* * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and * MAX_SOFTIRQ_RESTART in kernel/softirq.c. 
These are macros because @@ -190,7 +199,7 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* L: flags */ - unsigned long watchdog_ts; /* L: watchdog timestamp */ + unsigned long last_progress_ts; /* L: last forward progress timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ /* @@ -404,11 +413,12 @@ struct work_offq_data { u32 flags; }; -static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { +static const char * const wq_affn_names[WQ_AFFN_NR_TYPES] = { [WQ_AFFN_DFL] = "default", [WQ_AFFN_CPU] = "cpu", [WQ_AFFN_SMT] = "smt", [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_CACHE_SHARD] = "cache_shard", [WQ_AFFN_NUMA] = "numa", [WQ_AFFN_SYSTEM] = "system", }; @@ -431,13 +441,16 @@ module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); +static unsigned int wq_cache_shard_size = 8; +module_param_named(cache_shard_size, wq_cache_shard_size, uint, 0444); + static bool wq_online; /* can kworkers be created yet? 
*/ static bool wq_topo_initialized __read_mostly = false; static struct kmem_cache *pwq_cache; static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; -static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE_SHARD; /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf; @@ -530,6 +543,8 @@ struct workqueue_struct *system_bh_wq; EXPORT_SYMBOL_GPL(system_bh_wq); struct workqueue_struct *system_bh_highpri_wq; EXPORT_SYMBOL_GPL(system_bh_highpri_wq); +struct workqueue_struct *system_dfl_long_wq __ro_after_init; +EXPORT_SYMBOL_GPL(system_dfl_long_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); @@ -1697,7 +1712,7 @@ static void __pwq_activate_work(struct pool_workqueue *pwq, WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE)); trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) - pwq->pool->watchdog_ts = jiffies; + pwq->pool->last_progress_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); } @@ -1849,8 +1864,20 @@ static void unplug_oldest_pwq(struct workqueue_struct *wq) raw_spin_lock_irq(&pwq->pool->lock); if (pwq->plugged) { pwq->plugged = false; - if (pwq_activate_first_inactive(pwq, true)) + if (pwq_activate_first_inactive(pwq, true)) { + /* + * While plugged, queueing skips activation which + * includes bumping the nr_active count and adding the + * pwq to nna->pending_pwqs if the count can't be + * obtained. We need to restore both for the pwq being + * unplugged. The first call activates the first + * inactive work item and the second, if there are more + * inactive, puts the pwq on pending_pwqs. 
+ */ + pwq_activate_first_inactive(pwq, false); + kick_pool(pwq->pool); + } } raw_spin_unlock_irq(&pwq->pool->lock); } @@ -2348,7 +2375,7 @@ retry: */ if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) { if (list_empty(&pool->worklist)) - pool->watchdog_ts = jiffies; + pool->last_progress_ts = jiffies; trace_workqueue_activate_work(work); insert_work(pwq, work, &pool->worklist, work_flags); @@ -2507,7 +2534,6 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - WARN_ON_ONCE(!wq); WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); @@ -3204,6 +3230,7 @@ __acquires(&pool->lock) worker->current_pwq = pwq; if (worker->task) worker->current_at = worker->task->se.sum_exec_runtime; + worker->current_start = jiffies; work_data = *work_data_bits(work); worker->current_color = get_work_color(work_data); @@ -3352,7 +3379,7 @@ static void process_scheduled_works(struct worker *worker) while ((work = list_first_entry_or_null(&worker->scheduled, struct work_struct, entry))) { if (first) { - worker->pool->watchdog_ts = jiffies; + worker->pool->last_progress_ts = jiffies; first = false; } process_one_work(worker, work); @@ -4850,7 +4877,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; - pool->watchdog_ts = jiffies; + pool->last_progress_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -5622,8 +5649,16 @@ enomem: for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); - if (pwq) + if (pwq) { + /* + * Unlink pwq from wq->pwqs since link_pwq() + * may have already added it. wq->mutex is not + * needed as the wq has not been published yet. 
+ */ + if (!list_empty(&pwq->pwqs_node)) + list_del_rcu(&pwq->pwqs_node); kmem_cache_free(pwq_cache, pwq); + } } free_percpu(wq->cpu_pwq); wq->cpu_pwq = NULL; @@ -5891,6 +5926,33 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, } EXPORT_SYMBOL_GPL(alloc_workqueue_noprof); +static void devm_workqueue_release(void *res) +{ + destroy_workqueue(res); +} + +__printf(2, 5) struct workqueue_struct * +devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, + int max_active, ...) +{ + struct workqueue_struct *wq; + va_list args; + int ret; + + va_start(args, max_active); + wq = alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + ret = devm_add_action_or_reset(dev, devm_workqueue_release, wq); + if (ret) + return NULL; + + return wq; +} +EXPORT_SYMBOL_GPL(devm_alloc_workqueue); + #ifdef CONFIG_LOCKDEP __printf(1, 5) struct workqueue_struct * @@ -6274,7 +6336,7 @@ static void pr_cont_worker_id(struct worker *worker) { struct worker_pool *pool = worker->pool; - if (pool->flags & WQ_BH) + if (pool->flags & POOL_BH) pr_cont("bh%s", pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : ""); else @@ -6359,6 +6421,8 @@ static void show_pwq(struct pool_workqueue *pwq) pr_cont(" %s", comma ? "," : ""); pr_cont_worker_id(worker); pr_cont(":%ps", worker->current_func); + pr_cont(" for %us", + jiffies_to_msecs(jiffies - worker->current_start) / 1000); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work, &pcws); pr_cont_work_flush(comma, (work_func_t)-1L, &pcws); @@ -6462,7 +6526,7 @@ static void show_one_worker_pool(struct worker_pool *pool) /* How long the first pending work is waiting for a worker. 
*/ if (!list_empty(&pool->worklist)) - hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000; + hung = jiffies_to_msecs(jiffies - pool->last_progress_ts) / 1000; /* * Defer printing to avoid deadlocks in console drivers that @@ -7044,7 +7108,7 @@ int workqueue_unbound_housekeeping_update(const struct cpumask *hk) /* * If the operation fails, it will fall back to * wq_requested_unbound_cpumask which is initially set to - * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten + * HK_TYPE_DOMAIN house keeping mask and rewritten * by any subsequent write to workqueue/cpumask sysfs file. */ if (!cpumask_and(cpumask, wq_requested_unbound_cpumask, hk)) @@ -7063,13 +7127,7 @@ int workqueue_unbound_housekeeping_update(const struct cpumask *hk) static int parse_affn_scope(const char *val) { - int i; - - for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { - if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) - return i; - } - return -EINVAL; + return sysfs_match_string(wq_affn_names, val); } static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) @@ -7176,7 +7234,26 @@ static struct attribute *wq_sysfs_attrs[] = { &dev_attr_max_active.attr, NULL, }; -ATTRIBUTE_GROUPS(wq_sysfs); + +static umode_t wq_sysfs_is_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct workqueue_struct *wq = dev_to_wq(dev); + + /* + * Adjusting max_active breaks ordering guarantee. Changing it has no + * effect on BH worker. Limit max_active to RO in such case. 
+ */ + if (wq->flags & (WQ_BH | __WQ_ORDERED)) + return 0444; + return a->mode; +} + +static const struct attribute_group wq_sysfs_group = { + .is_visible = wq_sysfs_is_visible, + .attrs = wq_sysfs_attrs, +}; +__ATTRIBUTE_GROUPS(wq_sysfs); static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -7479,13 +7556,6 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) struct wq_device *wq_dev; int ret; - /* - * Adjusting max_active breaks ordering guarantee. Disallow exposing - * ordered workqueues. - */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) - return -EINVAL; - wq->wq_dev = wq_dev = kzalloc_obj(*wq_dev); if (!wq_dev) return -ENOMEM; @@ -7580,11 +7650,11 @@ MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds /* * Show workers that might prevent the processing of pending work items. - * The only candidates are CPU-bound workers in the running state. - * Pending work items should be handled by another idle worker - * in all other situations. + * A busy worker that is not running on the CPU (e.g. sleeping in + * wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as + * effectively as a CPU-bound one, so dump every in-flight worker. */ -static void show_cpu_pool_hog(struct worker_pool *pool) +static void show_cpu_pool_busy_workers(struct worker_pool *pool) { struct worker *worker; unsigned long irq_flags; @@ -7593,36 +7663,34 @@ static void show_cpu_pool_hog(struct worker_pool *pool) raw_spin_lock_irqsave(&pool->lock, irq_flags); hash_for_each(pool->busy_hash, bkt, worker, hentry) { - if (task_is_running(worker->task)) { - /* - * Defer printing to avoid deadlocks in console - * drivers that queue work while holding locks - * also taken in their write paths. - */ - printk_deferred_enter(); + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. 
+ */ + printk_deferred_enter(); - pr_info("pool %d:\n", pool->id); - sched_show_task(worker->task); + pr_info("pool %d:\n", pool->id); + sched_show_task(worker->task); - printk_deferred_exit(); - } + printk_deferred_exit(); } raw_spin_unlock_irqrestore(&pool->lock, irq_flags); } -static void show_cpu_pools_hogs(void) +static void show_cpu_pools_busy_workers(void) { struct worker_pool *pool; int pi; - pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n"); + pr_info("Showing backtraces of busy workers in stalled worker pools:\n"); rcu_read_lock(); for_each_pool(pool, pi) { if (pool->cpu_stall) - show_cpu_pool_hog(pool); + show_cpu_pool_busy_workers(pool); } @@ -7691,15 +7759,36 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); else touched = READ_ONCE(wq_watchdog_touched); - pool_ts = READ_ONCE(pool->watchdog_ts); + pool_ts = READ_ONCE(pool->last_progress_ts); if (time_after(pool_ts, touched)) ts = pool_ts; else ts = touched; - /* did we stall? */ + /* + * Did we stall? + * + * Do a lockless check first to do not disturb the system. + * + * Prevent false positives by double checking the timestamp + * under pool->lock. The lock makes sure that the check reads + * an updated pool->last_progress_ts when this CPU saw + * an already updated pool->worklist above. It seems better + * than adding another barrier into __queue_work() which + * is a hotter path. 
+ */ if (time_after(now, ts + thresh)) { + scoped_guard(raw_spinlock_irqsave, &pool->lock) { + pool_ts = pool->last_progress_ts; + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + } + if (!time_after(now, ts + thresh)) + continue; + lockup_detected = true; stall_time = jiffies_to_msecs(now - pool_ts) / 1000; max_stall_time = max(max_stall_time, stall_time); @@ -7711,15 +7800,13 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", stall_time); } - - } if (lockup_detected) show_all_workqueues(); if (cpu_pool_stall) - show_cpu_pools_hogs(); + show_cpu_pools_busy_workers(); if (lockup_detected) panic_on_wq_watchdog(max_stall_time); @@ -7845,8 +7932,8 @@ void __init workqueue_init_early(void) { struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM]; int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; - void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal, - bh_pool_kick_highpri }; + void (*irq_work_fns[NR_STD_WORKER_POOLS])(struct irq_work *) = + { bh_pool_kick_normal, bh_pool_kick_highpri }; int i, cpu; BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); @@ -7858,7 +7945,6 @@ void __init workqueue_init_early(void) cpumask_copy(wq_online_cpumask, cpu_online_mask); cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); - restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); @@ -7942,11 +8028,12 @@ void __init workqueue_init_early(void) system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); + system_dfl_long_wq = alloc_workqueue("events_dfl_long", WQ_UNBOUND, WQ_MAX_ACTIVE); BUG_ON(!system_wq || !system_percpu_wq|| 
!system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq || - !system_bh_wq || !system_bh_highpri_wq); + !system_bh_wq || !system_bh_highpri_wq || !system_dfl_long_wq); } static void __init wq_cpu_intensive_thresh_init(void) @@ -8112,6 +8199,186 @@ static bool __init cpus_share_numa(int cpu0, int cpu1) return cpu_to_node(cpu0) == cpu_to_node(cpu1); } +/* Maps each CPU to its shard index within the LLC pod it belongs to */ +static int cpu_shard_id[NR_CPUS] __initdata; + +/** + * llc_count_cores - count distinct cores (SMT groups) within an LLC pod + * @pod_cpus: the cpumask of CPUs in the LLC pod + * @smt_pods: the SMT pod type, used to identify sibling groups + * + * A core is represented by the lowest-numbered CPU in its SMT group. Returns + * the number of distinct cores found in @pod_cpus. + */ +static int __init llc_count_cores(const struct cpumask *pod_cpus, + struct wq_pod_type *smt_pods) +{ + const struct cpumask *sibling_cpus; + int nr_cores = 0, c; + + /* + * Count distinct cores by only counting the first CPU in each + * SMT sibling group. + */ + for_each_cpu(c, pod_cpus) { + sibling_cpus = smt_pods->pod_cpus[smt_pods->cpu_pod[c]]; + if (cpumask_first(sibling_cpus) == c) + nr_cores++; + } + + return nr_cores; +} + +/* + * llc_shard_size - number of cores in a given shard + * + * Cores are spread as evenly as possible. The first @nr_large_shards shards are + * "large shards" with (cores_per_shard + 1) cores; the rest are "default + * shards" with cores_per_shard cores. 
+ */ +static int __init llc_shard_size(int shard_id, int cores_per_shard, int nr_large_shards) +{ + /* The first @nr_large_shards shards are large shards */ + if (shard_id < nr_large_shards) + return cores_per_shard + 1; + + /* The remaining shards are default shards */ + return cores_per_shard; +} + +/* + * llc_calc_shard_layout - compute the shard layout for an LLC pod + * @nr_cores: number of distinct cores in the LLC pod + * + * Chooses the number of shards that keeps average shard size closest to + * wq_cache_shard_size. Returns a struct describing the total number of shards, + * the base size of each, and how many are large shards. + */ +static struct llc_shard_layout __init llc_calc_shard_layout(int nr_cores) +{ + struct llc_shard_layout layout; + + /* Ensure at least one shard; pick the count closest to the target size */ + layout.nr_shards = max(1, DIV_ROUND_CLOSEST(nr_cores, wq_cache_shard_size)); + layout.cores_per_shard = nr_cores / layout.nr_shards; + layout.nr_large_shards = nr_cores % layout.nr_shards; + + return layout; +} + +/* + * llc_shard_is_full - check whether a shard has reached its core capacity + * @cores_in_shard: number of cores already assigned to this shard + * @shard_id: index of the shard being checked + * @layout: the shard layout computed by llc_calc_shard_layout() + * + * Returns true if @cores_in_shard equals the expected size for @shard_id. + */ +static bool __init llc_shard_is_full(int cores_in_shard, int shard_id, + const struct llc_shard_layout *layout) +{ + return cores_in_shard == llc_shard_size(shard_id, layout->cores_per_shard, + layout->nr_large_shards); +} + +/** + * llc_populate_cpu_shard_id - populate cpu_shard_id[] for each CPU in an LLC pod + * @pod_cpus: the cpumask of CPUs in the LLC pod + * @smt_pods: the SMT pod type, used to identify sibling groups + * @nr_cores: number of distinct cores in @pod_cpus (from llc_count_cores()) + * + * Walks @pod_cpus in order. 
At each SMT group leader, advances to the next + * shard once the current shard is full. Results are written to cpu_shard_id[]. + */ +static void __init llc_populate_cpu_shard_id(const struct cpumask *pod_cpus, + struct wq_pod_type *smt_pods, + int nr_cores) +{ + struct llc_shard_layout layout = llc_calc_shard_layout(nr_cores); + const struct cpumask *sibling_cpus; + /* Count the number of cores in the current shard_id */ + int cores_in_shard = 0; + unsigned int leader; + /* This is a cursor for the shards. Goes from zero to nr_shards - 1. */ + int shard_id = 0; + int c; + + /* Iterate over every CPU for a given LLC pod, and assign it a shard */ + for_each_cpu(c, pod_cpus) { + sibling_cpus = smt_pods->pod_cpus[smt_pods->cpu_pod[c]]; + if (cpumask_first(sibling_cpus) == c) { + /* This is the CPU leader for the siblings */ + if (llc_shard_is_full(cores_in_shard, shard_id, &layout)) { + shard_id++; + cores_in_shard = 0; + } + cores_in_shard++; + cpu_shard_id[c] = shard_id; + } else { + /* + * The siblings' shard MUST be the same as the leader. + * Never split threads in the same core. + */ + leader = cpumask_first(sibling_cpus); + + /* + * This check silences a -Warray-bounds warning on UP + * configs where NR_CPUS=1 makes cpu_shard_id[] + * a single-element array, and the compiler can't + * prove the index is always 0. + */ + if (WARN_ON_ONCE(leader >= nr_cpu_ids)) + continue; + cpu_shard_id[c] = cpu_shard_id[leader]; + } + } + + WARN_ON_ONCE(shard_id != (layout.nr_shards - 1)); +} + +/** + * precompute_cache_shard_ids - assign each CPU its shard index within its LLC + * + * Iterates over all LLC pods. For each pod, counts distinct cores then assigns + * shard indices to all CPUs in the pod. Must be called after WQ_AFFN_CACHE and + * WQ_AFFN_SMT have been initialized. 
+ */ +static void __init precompute_cache_shard_ids(void) +{ + struct wq_pod_type *llc_pods = &wq_pod_types[WQ_AFFN_CACHE]; + struct wq_pod_type *smt_pods = &wq_pod_types[WQ_AFFN_SMT]; + const struct cpumask *cpus_sharing_llc; + int nr_cores; + int pod; + + if (!wq_cache_shard_size) { + pr_warn("workqueue: cache_shard_size must be > 0, setting to 1\n"); + wq_cache_shard_size = 1; + } + + for (pod = 0; pod < llc_pods->nr_pods; pod++) { + cpus_sharing_llc = llc_pods->pod_cpus[pod]; + + /* Number of cores in this given LLC */ + nr_cores = llc_count_cores(cpus_sharing_llc, smt_pods); + llc_populate_cpu_shard_id(cpus_sharing_llc, smt_pods, nr_cores); + } +} + +/* + * cpus_share_cache_shard - test whether two CPUs belong to the same cache shard + * + * Two CPUs share a cache shard if they are in the same LLC and have the same + * shard index. Used as the pod affinity callback for WQ_AFFN_CACHE_SHARD. + */ +static bool __init cpus_share_cache_shard(int cpu0, int cpu1) +{ + if (!cpus_share_cache(cpu0, cpu1)) + return false; + + return cpu_shard_id[cpu0] == cpu_shard_id[cpu1]; +} + /** * workqueue_init_topology - initialize CPU pods for unbound workqueues * @@ -8127,6 +8394,8 @@ void __init workqueue_init_topology(void) init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); + precompute_cache_shard_ids(); + init_pod_type(&wq_pod_types[WQ_AFFN_CACHE_SHARD], cpus_share_cache_shard); init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); wq_topo_initialized = true; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index f6275944ada7..8def1ddc5a1b 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -32,6 +32,7 @@ struct worker { work_func_t current_func; /* K: function */ struct pool_workqueue *current_pwq; /* K: pwq */ u64 current_at; /* K: runtime at start or last wakeup */ + unsigned long 
current_start; /* K: start time of current work item */ unsigned int current_color; /* K: color */ int sleeping; /* S: is worker sleeping? */ |
