diff options
Diffstat (limited to 'kernel')
166 files changed, 15548 insertions, 6069 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 6785982013dc..1e1a31673577 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the linux kernel. # -obj-y = fork.o exec_domain.o panic.o \ +obj-y = fork.o exec_domain.o exec_state.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ diff --git a/kernel/acct.c b/kernel/acct.c index cbbf79d718cf..c440d43479ca 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -249,7 +249,7 @@ static int acct_on(const char __user *name) return -EINVAL; /* Exclude procfs and sysfs. */ - if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) + if (file_inode(file)->i_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) return -EINVAL; if (!(file->f_mode & FMODE_CAN_WRITE)) diff --git a/kernel/audit.c b/kernel/audit.c index e1d489bc2dff..dcc657d35776 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1468,6 +1468,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: + if (audit_enabled == AUDIT_LOCKED) + return -EPERM; audit_trim_trees(); audit_log_common_recv_msg(audit_context(), &ab, AUDIT_CONFIG_CHANGE); @@ -1480,6 +1482,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, size_t msglen = data_len; char *old, *new; + if (audit_enabled == AUDIT_LOCKED) + return -EPERM; err = -EINVAL; if (msglen < 2 * sizeof(u32)) break; @@ -2030,7 +2034,7 @@ void audit_log_vformat(struct audit_buffer *ab, const char *fmt, va_list args) * here and AUDIT_BUFSIZ is at least 1024, then we can * log everything that printk could have logged. 
*/ avail = audit_expand(ab, - max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); + max_t(unsigned int, AUDIT_BUFSIZ, 1+len-avail)); if (!avail) goto out_va_end; len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); diff --git a/kernel/audit.h b/kernel/audit.h index ac81fa02bcd7..92d5e723d570 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -233,7 +233,7 @@ static inline int audit_hash_ino(u64 ino) /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 -extern int audit_match_class(int class, unsigned syscall); +extern int audit_match_class(int class, unsigned int syscall); extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); @@ -256,8 +256,13 @@ extern int audit_del_rule(struct audit_entry *entry); extern void audit_free_rule_rcu(struct rcu_head *head); extern struct list_head audit_filter_list[]; -extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); +struct audit_watch_ctx { + struct inode *dir; + struct inode *child; +}; +extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch_ctx *ctx); extern void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm); @@ -280,13 +285,15 @@ extern char *audit_watch_path(struct audit_watch *watch); extern int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev); extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, - char *pathname, int len); + char *pathname, int len, + struct audit_watch_ctx *ctx); extern char *audit_mark_path(struct audit_fsnotify_mark *mark); extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); extern void audit_remove_mark_rule(struct audit_krule *krule); extern int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino, dev_t dev); -extern int audit_dupe_exe(struct audit_krule *new, struct 
audit_krule *old); +extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old, + struct audit_watch_ctx *ctx); extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); @@ -317,13 +324,13 @@ extern struct list_head *audit_killed_trees(void); #define audit_watch_path(w) "" #define audit_watch_compare(w, i, d) 0 -#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL)) +#define audit_alloc_mark(k, p, l, c) (ERR_PTR(-EINVAL)) #define audit_mark_path(m) "" #define audit_remove_mark(m) do { } while (0) #define audit_remove_mark_rule(k) do { } while (0) #define audit_mark_compare(m, i, d) 0 #define audit_exe_compare(t, m) (-EINVAL) -#define audit_dupe_exe(n, o) (-EINVAL) +#define audit_dupe_exe(n, o, c) (-EINVAL) #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 711454f9f724..fa33d57e4320 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -71,22 +71,29 @@ static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, audit_mark->ino = inode ? 
inode->i_ino : AUDIT_INO_UNSET; } -struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len) +struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, + int len, struct audit_watch_ctx *ctx) { struct audit_fsnotify_mark *audit_mark; struct path path; struct dentry *dentry; - int ret; + struct inode *dir, *child; + int ret, allow_dups; if (pathname[0] != '/' || pathname[len-1] == '/') return ERR_PTR(-EINVAL); - dentry = kern_path_parent(pathname, &path); - if (IS_ERR(dentry)) - return ERR_CAST(dentry); /* returning an error */ - if (d_really_is_negative(dentry)) { - audit_mark = ERR_PTR(-ENOENT); - goto out; + if (!ctx) { + dentry = kern_path_parent(pathname, &path); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); /* returning an error */ + dir = d_inode(path.dentry); + child = d_inode(dentry); + allow_dups = 0; + } else { + dir = ctx->dir; + child = ctx->child; + allow_dups = 1; } audit_mark = kzalloc_obj(*audit_mark); @@ -98,18 +105,21 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group); audit_mark->mark.mask = AUDIT_FS_EVENTS; audit_mark->path = pathname; - audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0); + audit_update_mark(audit_mark, child); + ret = fsnotify_add_inode_mark(&audit_mark->mark, dir, allow_dups); + if (ret < 0) { audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } out: - dput(dentry); - path_put(&path); + if (!ctx) { + dput(dentry); + path_put(&path); + } return audit_mark; } diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index ee84777fdfad..1ed19b775912 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -33,7 +33,7 @@ struct audit_chunk { struct audit_node { struct list_head list; struct audit_tree *owner; - unsigned index; /* 
index; upper bit indicates 'will prune' */ + unsigned int index; /* index; upper bit indicates 'will prune' */ } owners[] __counted_by(count); }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 33577f0f54ef..06dd0ebe73e2 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -244,7 +244,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc /* Update inode info in audit rules based on filesystem event. */ static void audit_update_watch(struct audit_parent *parent, const struct qstr *dname, dev_t dev, - u64 ino, unsigned invalidating) + u64 ino, unsigned int invalidating, + struct audit_watch_ctx *ctx) { struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; @@ -280,7 +281,7 @@ static void audit_update_watch(struct audit_parent *parent, list_del(&oentry->rule.rlist); list_del_rcu(&oentry->list); - nentry = audit_dupe_rule(&oentry->rule); + nentry = audit_dupe_rule(&oentry->rule, ctx); if (IS_ERR(nentry)) { list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); @@ -479,10 +480,17 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask, if (WARN_ON_ONCE(inode_mark->group != audit_watch_group)) return 0; - if (mask & (FS_CREATE|FS_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); - else if (mask & (FS_DELETE|FS_MOVED_FROM)) - audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1); + if (mask & (FS_CREATE|FS_MOVED_TO) && inode) { + struct audit_watch_ctx ctx = { .dir = dir, .child = inode }; + + audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0, + &ctx); + } else if (mask & (FS_DELETE|FS_MOVED_FROM)) { + struct audit_watch_ctx ctx = { .dir = dir, .child = NULL }; + + audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1, + &ctx); + } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); @@ -505,7 +513,8 @@ static 
int __init audit_watch_init(void) } device_initcall(audit_watch_init); -int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) +int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old, + struct audit_watch_ctx *ctx) { struct audit_fsnotify_mark *audit_mark; char *pathname; @@ -514,7 +523,7 @@ int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) if (!pathname) return -ENOMEM; - audit_mark = audit_alloc_mark(new, pathname, strlen(pathname)); + audit_mark = audit_alloc_mark(new, pathname, strlen(pathname), ctx); if (IS_ERR(audit_mark)) { kfree(pathname); return PTR_ERR(audit_mark); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 093425123f6c..4401119b5275 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -165,13 +165,13 @@ static inline int audit_to_inode(struct audit_krule *krule, static __u32 *classes[AUDIT_SYSCALL_CLASSES]; -int __init audit_register_class(int class, unsigned *list) +int __init audit_register_class(int class, unsigned int *list) { __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { - unsigned n = *list++; + unsigned int n = *list++; if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) { kfree(p); return -EINVAL; @@ -186,7 +186,7 @@ int __init audit_register_class(int class, unsigned *list) return 0; } -int audit_match_class(int class, unsigned syscall) +int audit_match_class(int class, unsigned int syscall) { if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) return 0; @@ -237,7 +237,7 @@ static int audit_match_signal(struct audit_entry *entry) /* Common user-space to kernel rule translation. 
*/ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule) { - unsigned listnr; + unsigned int listnr; struct audit_entry *entry; int i, err; @@ -589,7 +589,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, err = PTR_ERR(str); goto exit_free; } - audit_mark = audit_alloc_mark(&entry->rule, str, f_val); + audit_mark = audit_alloc_mark(&entry->rule, str, f_val, NULL); if (IS_ERR(audit_mark)) { kfree(str); err = PTR_ERR(audit_mark); @@ -816,7 +816,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. */ -struct audit_entry *audit_dupe_rule(struct audit_krule *old) +struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch_ctx *ctx) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -875,7 +876,7 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) new->filterkey = fk; break; case AUDIT_EXE: - err = audit_dupe_exe(new, old); + err = audit_dupe_exe(new, old, ctx); break; } if (err) { @@ -1414,7 +1415,7 @@ static int update_lsm_rule(struct audit_krule *r) if (!security_audit_rule_known(r)) return 0; - nentry = audit_dupe_rule(r); + nentry = audit_dupe_rule(r, NULL); if (entry->rule.exe) audit_remove_mark(entry->rule.exe); if (IS_ERR(nentry)) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ab54fccba215..6610e667c728 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -150,7 +150,7 @@ static const struct audit_nfcfgop_tab audit_nfcfgs[] = { static int audit_match_perm(struct audit_context *ctx, int mask) { - unsigned n; + unsigned int n; if (unlikely(!ctx)) return 0; @@ -2786,7 +2786,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; - 
context->capset.cap.inheritable = new->cap_effective; + context->capset.cap.inheritable = new->cap_inheritable; context->capset.cap.permitted = new->cap_permitted; context->capset.cap.ambient = new->cap_ambient; context->type = AUDIT_CAPSET; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 399007b67a92..4dc41bf5780c 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o const_fold.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o cnum.o log.o token.o liveness.o const_fold.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o bpf_insn_array.o diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 802656c6fd3c..af49c154473d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -53,12 +53,15 @@ struct bpf_arena { u64 user_vm_start; u64 user_vm_end; struct vm_struct *kern_vm; + struct page *scratch_page; struct range_tree rt; /* protects rt */ rqspinlock_t spinlock; struct list_head vma_list; /* protects vma_list */ struct mutex lock; + u64 zap_gen; + struct mutex zap_mutex; struct irq_work free_irq; struct work_struct free_work; struct llist_head free_spans; @@ -83,6 +86,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) return arena ? arena->user_vm_start : 0; } +/** + * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map * + * @map: a BPF_MAP_TYPE_ARENA map + * + * Return @map's kern_vm_start. 
+ */ +u64 bpf_arena_map_kern_vm_start(struct bpf_map *map) +{ + return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map)); +} + +/** + * bpf_prog_arena - return the bpf_map of the arena referenced by @prog + * @prog: a loaded BPF program + * + * The verifier enforces at most one arena per program and stores it in + * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if + * @prog does not reference an arena. + */ +struct bpf_map *bpf_prog_arena(struct bpf_prog *prog) +{ + struct bpf_arena *arena = prog->aux->arena; + + return arena ? &arena->map : NULL; +} + static long arena_map_peek_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; @@ -115,26 +144,57 @@ static long compute_pgoff(struct bpf_arena *arena, long uaddr) struct apply_range_data { struct page **pages; + struct page *scratch_page; int i; }; +struct clear_range_data { + struct llist_head *free_pages; + struct page *scratch_page; +}; + static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data) { struct apply_range_data *d = data; struct page *page; + pte_t pteval; if (!data) return 0; - /* sanity check */ - if (unlikely(!pte_none(ptep_get(pte)))) - return -EBUSY; page = d->pages[d->i]; /* paranoia, similar to vmap_pages_pte_range() */ if (WARN_ON_ONCE(!pfn_valid(page_to_pfn(page)))) return -EINVAL; - set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + pteval = mk_pte(page, PAGE_KERNEL); +#ifdef ptep_try_set + /* + * Kernel-fault recovery may have installed the scratch page here, and + * some architectures (arm64) prohibit valid->valid PTE transitions. + * Install atomically into a none slot. If scratch is present, clear it + * and flush_tlb_before_set() (break-before-make) before retrying. 
+ */ + while (!ptep_try_set(pte, pteval)) { + pte_t old = ptep_get(pte); + + if (pte_none(old)) + continue; + if (WARN_ON_ONCE(pte_page(old) != d->scratch_page)) + return -EBUSY; + ptep_get_and_clear(&init_mm, addr, pte); + flush_tlb_before_set(addr); + } +#else + /* + * Without ptep_try_set() there is no atomic installer, but such arches + * also do not wire up bpf_arena_handle_page_fault(), so no scratch page + * is ever installed and the slot is always none here. + */ + if (unlikely(!pte_none(ptep_get(pte)))) + return -EBUSY; + set_pte_at(&init_mm, addr, pte, pteval); +#endif d->i++; return 0; } @@ -144,33 +204,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size) flush_cache_vmap(start, start + size); } -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages) +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data) { + struct clear_range_data *d = data; pte_t old_pte; struct page *page; - /* sanity check */ - old_pte = ptep_get(pte); + /* + * Pairs with ptep_try_set() in the kernel-fault scratch installer. + * Both sides must be atomic. + */ + old_pte = ptep_get_and_clear(&init_mm, addr, pte); if (pte_none(old_pte) || !pte_present(old_pte)) - return 0; /* nothing to do */ + return 0; page = pte_page(old_pte); if (WARN_ON_ONCE(!page)) return -EINVAL; - pte_clear(&init_mm, addr, pte); + /* + * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr + * scratches its PTE. A later bpf_arena_free_pages() over that range walks + * here. Without the skip, scratch_page would be freed. 
+ */ + if (page == d->scratch_page) + return 0; + + __llist_add(&page->pcp_llist, d->free_pages); + return 0; +} - /* Add page to the list so it is freed later */ - if (free_pages) - __llist_add(&page->pcp_llist, free_pages); +static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data) +{ + struct page *scratch_page = data; + if (!pte_none(ptep_get(pte))) + return 0; + /* + * Best-effort install. ptep_try_set() returns false only if another + * installer (real allocation or concurrent fault) won the cmpxchg. + * Their PTE is already valid, so the access retry succeeds. + * + * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just + * cause one extra re-fault through this same path. + */ + ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL)); return 0; } static int populate_pgtable_except_pte(struct bpf_arena *arena) { + /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */ return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL); + SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL); } static struct bpf_map *arena_map_alloc(union bpf_attr *attr) @@ -221,22 +307,30 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) init_irq_work(&arena->free_irq, arena_free_irq); INIT_WORK(&arena->free_work, arena_free_worker); bpf_map_init_from_attr(&arena->map, attr); + + err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page); + if (err) + goto err_free_arena; + range_tree_init(&arena->rt); err = range_tree_set(&arena->rt, 0, attr->max_entries); - if (err) { - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_free_scratch; mutex_init(&arena->lock); + mutex_init(&arena->zap_mutex); raw_res_spin_lock_init(&arena->spinlock); err = populate_pgtable_except_pte(arena); - if (err) { - range_tree_destroy(&arena->rt); - bpf_map_area_free(arena); - goto err; - } + if (err) + goto err_destroy_rt; return &arena->map; + 
+err_destroy_rt: + range_tree_destroy(&arena->rt); +err_free_scratch: + __free_page(arena->scratch_page); +err_free_arena: + bpf_map_area_free(arena); err: free_vm_area(kern_vm); return ERR_PTR(err); @@ -244,6 +338,7 @@ err: static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) { + struct bpf_arena *arena = data; struct page *page; pte_t pte; @@ -252,6 +347,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data) return 0; page = pte_page(pte); /* + * Skip the scratch page. The walk is page-table-driven, not range-tree-driven, + * so it can visit scratch PTEs at uaddrs the BPF program never allocated. + */ + if (page == arena->scratch_page) + return 0; + /* * We do not update pte here: * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug * 2. TLB flushing is batched or deferred. Even if we clear pte, @@ -286,9 +387,10 @@ static void arena_map_free(struct bpf_map *map) * free those pages. */ apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena), - KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL); + SZ_4G + GUARD_SZ / 2, existing_page_cb, arena); free_vm_area(arena->kern_vm); range_tree_destroy(&arena->rt); + __free_page(arena->scratch_page); bpf_map_area_free(arena); } @@ -318,6 +420,7 @@ struct vma_list { struct vm_area_struct *vma; struct list_head head; refcount_t mmap_count; + u64 zap_gen; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -330,6 +433,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) refcount_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; + vml->zap_gen = 0; list_add(&vml->head, &arena->vma_list); return 0; } @@ -384,33 +488,38 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_RETRY; page = vmalloc_to_page((void *)kaddr); - if (page) + if (page) { + if (page == arena->scratch_page) + /* BPF triggered scratch here; don't lazy-alloc over it */ + goto 
out_sigsegv; /* already have a page vmap-ed */ goto out; + } bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT) /* User space requested to segfault when page is not allocated by bpf prog */ - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; ret = range_tree_clear(&arena->rt, vmf->pgoff, 1); if (ret) - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; - struct apply_range_data data = { .pages = &page, .i = 0 }; + struct apply_range_data data = { .pages = &page, .i = 0, + .scratch_page = arena->scratch_page }; /* Account into memcg of the process that created bpf_arena */ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); free_pages_nolock(page, 0); - goto out_unlock_sigsegv; + goto out_sigsegv_memcg; } flush_vmap_cache(kaddr, PAGE_SIZE); bpf_map_memcg_exit(old_memcg, new_memcg); @@ -419,8 +528,9 @@ out: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); vmf->page = page; return 0; -out_unlock_sigsegv: +out_sigsegv_memcg: bpf_map_memcg_exit(old_memcg, new_memcg); +out_sigsegv: raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); return VM_FAULT_SIGSEGV; } @@ -511,7 +621,7 @@ static int arena_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 { struct bpf_arena *arena = container_of(map, struct bpf_arena, map); - if ((u64)off > arena->user_vm_end - arena->user_vm_start) + if ((u64)off >= arena->user_vm_end - arena->user_vm_start) return -ERANGE; *imm = (unsigned long)arena->user_vm_start; return 0; @@ -587,6 +697,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } data.pages = pages; + data.scratch_page = arena->scratch_page; if (raw_res_spin_lock_irqsave(&arena->spinlock, flags)) goto 
out_free_pages; @@ -668,12 +779,60 @@ out_free_pages: */ static void zap_pages(struct bpf_arena *arena, long uaddr, long page_cnt) { + unsigned long size = (unsigned long)page_cnt << PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm; struct vma_list *vml; + unsigned long vm_start; + u64 my_gen; - guard(mutex)(&arena->lock); - /* iterate link list under lock */ - list_for_each_entry(vml, &arena->vma_list, head) - zap_vma_range(vml->vma, uaddr, PAGE_SIZE * page_cnt); + /* + * Taking mmap_read_lock() under arena->lock would deadlock against + * arena_vm_close(), which runs with mmap_write_lock held and then + * acquires arena->lock. Drop arena->lock for mmap_read_lock(). + * + * Use per-call my_gen, recorded in vml->zap_gen, to remember which + * vmls this invocation has already processed across the lock drop. + * Hold zap_mutex around the whole walk so concurrent zap_pages() + * callers cannot overwrite each other's marks on shared vmls -- + * otherwise call B's mark would make call A skip a vml that A has + * not yet zapped for A's uaddr range. + */ + mutex_lock(&arena->zap_mutex); + mutex_lock(&arena->lock); + my_gen = ++arena->zap_gen; + for (;;) { + mm = NULL; + list_for_each_entry(vml, &arena->vma_list, head) { + if (vml->zap_gen >= my_gen) + continue; + vml->zap_gen = my_gen; + if (!mmget_not_zero(vml->vma->vm_mm)) + continue; + mm = vml->vma->vm_mm; + vm_start = vml->vma->vm_start; + break; + } + if (!mm) + break; + mutex_unlock(&arena->lock); + + mmap_read_lock(mm); + /* + * Re-resolve: while we waited the VMA could have been unmapped + * and a different mapping installed at the same address. 
+ */ + vma = find_vma(mm, vm_start); + if (vma && vma->vm_start == vm_start && + vma->vm_file && vma->vm_file->private_data == &arena->map) + zap_vma_range(vma, uaddr, size); + mmap_read_unlock(mm); + mmput(mm); + + mutex_lock(&arena->lock); + } + mutex_unlock(&arena->lock); + mutex_unlock(&arena->zap_mutex); } static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, bool sleepable) @@ -685,6 +844,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, struct llist_head free_pages; struct llist_node *pos, *t; struct arena_free_span *s; + struct clear_range_data cdata; unsigned long flags; int ret = 0; @@ -713,9 +873,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt, range_tree_set(&arena->rt, pgoff, page_cnt); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; /* clear ptes and collect struct pages */ apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); /* drop the lock to do the tlb flush and zap pages */ raw_res_spin_unlock_irqrestore(&arena->spinlock, flags); @@ -805,6 +967,7 @@ static void arena_free_worker(struct work_struct *work) struct arena_free_span *s; u64 arena_vm_start, user_vm_start; struct llist_head free_pages; + struct clear_range_data cdata; struct page *page; unsigned long full_uaddr; long kaddr, page_cnt, pgoff; @@ -818,6 +981,8 @@ static void arena_free_worker(struct work_struct *work) bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg); init_llist_head(&free_pages); + cdata.free_pages = &free_pages; + cdata.scratch_page = arena->scratch_page; arena_vm_start = bpf_arena_get_kern_vm_start(arena); user_vm_start = bpf_arena_get_user_vm_start(arena); @@ -830,7 +995,7 @@ static void arena_free_worker(struct work_struct *work) /* clear ptes and collect pages in free_pages llist */ 
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT, - apply_range_clear_cb, &free_pages); + apply_range_clear_cb, &cdata); range_tree_set(&arena->rt, pgoff, page_cnt); } @@ -893,6 +1058,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false); } + +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, + int node_id, u64 flags) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt) + return NULL; + + return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true); +} + __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt) { struct bpf_map *map = p__map; @@ -945,23 +1123,12 @@ static int __init kfunc_init(void) } late_initcall(kfunc_init); -void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write, + unsigned long addr, unsigned long fault_ip) { struct bpf_stream_stage ss; - struct bpf_prog *prog; u64 user_vm_start; - /* - * The RCU read lock is held to safely traverse the latch tree, but we - * don't need its protection when accessing the prog, since it will not - * disappear while we are handling the fault. 
- */ - rcu_read_lock(); - prog = bpf_prog_ksym_find(fault_ip); - rcu_read_unlock(); - if (!prog) - return; - /* Use main prog for stream access */ prog = prog->aux->main_prog_aux->prog; @@ -974,3 +1141,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo bpf_stream_dump_stack(ss); })); } + +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip) +{ + struct bpf_arena *arena; + struct bpf_prog *prog; + unsigned long kbase; + unsigned long page_addr = addr & PAGE_MASK; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return false; + + arena = prog->aux->arena; + /* a prog not using arena may be on stack, so arena can be NULL */ + if (!arena) + return false; + + kbase = bpf_arena_get_kern_vm_start(arena); + + /* + * Recovery covers the 4 GiB mappable band plus the upper half-guard. + * Lower guard is unreachable from kfuncs; an address there indicates + * a different bug class - leave it to the regular kernel oops path. + */ + if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2) + return false; + + apply_to_page_range(&init_mm, page_addr, PAGE_SIZE, + apply_range_set_scratch_cb, arena->scratch_page); + flush_vmap_cache(page_addr, PAGE_SIZE); + __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip); + return true; +} + +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip) +{ + struct bpf_prog *prog; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it will not + * disappear while we are handling the fault. 
+ */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(fault_ip); + rcu_read_unlock(); + if (!prog) + return; + __bpf_prog_report_arena_violation(prog, write, addr, fault_ip); +} diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 5e25e0353509..248b4818178c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -175,14 +175,12 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } -static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, - void *hash_buf) +static int array_map_get_hash(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); sha256(array->value, (u64)array->elem_size * array->map.max_entries, - hash_buf); - memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + array->map.sha); return 0; } @@ -386,7 +384,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -394,7 +392,7 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - bpf_obj_free_fields(array->map.record, val); + bpf_obj_cancel_fields(map, val); } return 0; } @@ -434,14 +432,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(map, ptr, value); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); goto unlock; } for_each_possible_cpu(cpu) { ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? 
value : value + size * cpu; copy_map_value(map, ptr, val); - bpf_obj_free_fields(array->map.record, ptr); + bpf_obj_cancel_fields(map, ptr); } unlock: rcu_read_unlock(); @@ -827,7 +825,7 @@ const struct bpf_map_ops array_map_ops = { }; const struct bpf_map_ops percpu_array_map_ops = { - .map_meta_equal = bpf_map_meta_equal, + .map_meta_equal = array_map_meta_equal, .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, diff --git a/kernel/bpf/backtrack.c b/kernel/bpf/backtrack.c index 854731dc93fe..2e4ae0ef0860 100644 --- a/kernel/bpf/backtrack.c +++ b/kernel/bpf/backtrack.c @@ -9,7 +9,7 @@ /* for any branch, call, exit record the history of jmps in the given state */ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) + int insn_flags, int spi, int frame, u64 linked_regs) { u32 cnt = cur->jmp_history_cnt; struct bpf_jmp_history_entry *p; @@ -25,6 +25,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state env, "insn history: insn_idx %d cur flags %x new flags %x", env->insn_idx, env->cur_hist_ent->flags, insn_flags); env->cur_hist_ent->flags |= insn_flags; + env->cur_hist_ent->spi = spi; + env->cur_hist_ent->frame = frame; verifier_bug_if(env->cur_hist_ent->linked_regs != 0, env, "insn history: insn_idx %d linked_regs: %#llx", env->insn_idx, env->cur_hist_ent->linked_regs); @@ -43,6 +45,8 @@ int bpf_push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; + p->spi = spi; + p->frame = frame; p->linked_regs = linked_regs; cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; @@ -64,16 +68,6 @@ static bool is_atomic_fetch_insn(const struct bpf_insn *insn) (insn->imm & BPF_FETCH); } -static int insn_stack_access_spi(int insn_flags) -{ - return (insn_flags >> INSN_F_SPI_SHIFT) & INSN_F_SPI_MASK; -} - -static int 
insn_stack_access_frameno(int insn_flags) -{ - return insn_flags & INSN_F_FRAMENO_MASK; -} - /* Backtrack one insn at a time. If idx is not at the top of recorded * history then previous instruction came from straight line execution. * Return -ENOENT if we exhausted all instructions within given state. @@ -135,11 +129,21 @@ static inline u32 bt_empty(struct backtrack_state *bt) int i; for (i = 0; i <= bt->frame; i++) - mask |= bt->reg_masks[i] | bt->stack_masks[i]; + mask |= bt->reg_masks[i] | bt->stack_masks[i] | bt->stack_arg_masks[i]; return mask == 0; } +static inline void bt_clear_frame_stack_arg_slot(struct backtrack_state *bt, u32 frame, u32 slot) +{ + bt->stack_arg_masks[frame] &= ~(1 << slot); +} + +static inline bool bt_is_frame_stack_arg_slot_set(struct backtrack_state *bt, u32 frame, u32 slot) +{ + return bt->stack_arg_masks[frame] & (1 << slot); +} + static inline int bt_subprog_enter(struct backtrack_state *bt) { if (bt->frame == MAX_CALL_FRAMES - 1) { @@ -200,6 +204,11 @@ static inline u64 bt_stack_mask(struct backtrack_state *bt) return bt->stack_masks[bt->frame]; } +static inline u8 bt_stack_arg_mask(struct backtrack_state *bt) +{ + return bt->stack_arg_masks[bt->frame]; +} + static inline bool bt_is_reg_set(struct backtrack_state *bt, u32 reg) { return bt->reg_masks[bt->frame] & (1 << reg); @@ -341,6 +350,19 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, return 0; bt_clear_reg(bt, load_reg); + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + /* + * Stack arg read: callee reads from r11+off, but + * the data lives in the caller's stack_arg_regs. + * Set the mask in the caller frame so precision + * is marked in the caller's slot at the callee + * entry checkpoint. + */ + bt_set_frame_stack_arg_slot(bt, bt->frame - 1, spi); + return 0; + } + /* scalars can only be spilled into stack w/o losing precision. * Load from any other memory can be zero extended. 
* The desire to keep that precision is already indicated @@ -353,8 +375,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * that [fp - off] slot contains scalar that needs to be * tracked with precision */ - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; bpf_bt_set_frame_slot(bt, fr, spi); } else if (class == BPF_STX || class == BPF_ST) { if (bt_is_reg_set(bt, dreg)) @@ -363,11 +385,22 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * encountered a case of pointer subtraction. */ return -ENOTSUPP; + + if (hist && hist->flags & INSN_F_STACK_ARG_ACCESS) { + spi = hist->spi; + if (!bt_is_frame_stack_arg_slot_set(bt, bt->frame, spi)) + return 0; + bt_clear_frame_stack_arg_slot(bt, bt->frame, spi); + if (class == BPF_STX) + bt_set_reg(bt, sreg); + return 0; + } + /* scalars can only be spilled into stack */ if (!hist || !(hist->flags & INSN_F_STACK_ACCESS)) return 0; - spi = insn_stack_access_spi(hist->flags); - fr = insn_stack_access_frameno(hist->flags); + spi = hist->spi; + fr = hist->frame; if (!bt_is_frame_slot_set(bt, fr, spi)) return 0; bt_clear_frame_slot(bt, fr, spi); @@ -431,6 +464,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, bpf_bt_set_frame_reg(bt, bt->frame - 1, i); } } + if (bt_stack_arg_mask(bt)) { + verifier_bug(env, + "static subprog leftover stack arg slots %x", + bt_stack_arg_mask(bt)); + return -EFAULT; + } if (bt_subprog_exit(bt)) return -EFAULT; return 0; @@ -901,6 +940,17 @@ int bpf_mark_chain_precision(struct bpf_verifier_env *env, *changed = true; } } + for (i = 0; i < func->out_stack_arg_cnt; i++) { + if (!bt_is_frame_stack_arg_slot_set(bt, fr, i)) + continue; + reg = &func->stack_arg_regs[i]; + if (reg->type != SCALAR_VALUE || reg->precise) { + bt_clear_frame_stack_arg_slot(bt, fr, i); + } else { + reg->precise = true; + *changed = true; + } + } if 
(env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_reg_mask(bt, fr)); diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e7a2fc60523f..5ed7cb4b98c0 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,23 +13,8 @@ #define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET -/* Helpers to get the local list index */ -#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) -#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) -#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) -/* Local list helpers */ -static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) -{ - return &loc_l->lists[LOCAL_FREE_LIST_IDX]; -} - -static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) -{ - return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; -} - /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { @@ -72,6 +57,7 @@ static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, bpf_lru_list_count_dec(l, node->type); node->type = tgt_free_type; + WRITE_ONCE(node->pending_free, 0); list_move(&node->list, free_list); } @@ -87,6 +73,9 @@ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; bpf_lru_node_clear_ref(node); + /* Reset pending_free only when moving to the free list */ + if (tgt_type == BPF_LRU_LIST_T_FREE) + WRITE_ONCE(node->pending_free, 0); list_move(&node->list, &l->lists[tgt_type]); } @@ -212,9 +201,11 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, unsigned int i = 0; list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { - if (bpf_lru_node_is_ref(node)) { + if (bpf_lru_node_is_ref(node) && + !READ_ONCE(node->pending_free)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); - } else if 
(lru->del_from_htab(lru->del_arg, node)) { + } else if (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); if (++nshrinked == tgt_nshrink) @@ -273,7 +264,8 @@ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, list) { - if (lru->del_from_htab(lru->del_arg, node)) { + if (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); return 1; @@ -290,8 +282,10 @@ static void __local_list_flush(struct bpf_lru_list *l, struct bpf_lru_node *node, *tmp_node; list_for_each_entry_safe_reverse(node, tmp_node, - local_pending_list(loc_l), list) { - if (bpf_lru_node_is_ref(node)) + &loc_l->pending_list, list) { + if (READ_ONCE(node->pending_free)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_FREE); + else if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move_in(l, node, @@ -307,9 +301,12 @@ static void bpf_lru_list_push_free(struct bpf_lru_list *l, if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); } static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, @@ -318,8 +315,10 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, struct bpf_lru_list *l = &lru->common_lru.lru_list; struct bpf_lru_node *node, *tmp_node; unsigned int nfree = 0; + LIST_HEAD(tmp_free); - raw_spin_lock(&l->lock); + if (raw_res_spin_lock(&l->lock)) + return; __local_list_flush(l, loc_l); @@ -327,7 +326,7 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, 
list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], list) { - __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + __bpf_lru_node_move_to_free(l, node, &tmp_free, BPF_LRU_LOCAL_LIST_T_FREE); if (++nfree == lru->target_free) break; @@ -335,10 +334,19 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, if (nfree < lru->target_free) __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, - local_free_list(loc_l), + &tmp_free, BPF_LRU_LOCAL_LIST_T_FREE); - raw_spin_unlock(&l->lock); + raw_res_spin_unlock(&l->lock); + + /* + * Transfer the harvested nodes from the temporary list_head into + * the lockless per-CPU free llist. + */ + list_for_each_entry_safe(node, tmp_node, &tmp_free, list) { + list_del(&node->list); + llist_add(&node->llist, &loc_l->free_llist); + } } static void __local_list_add_pending(struct bpf_lru *lru, @@ -350,22 +358,21 @@ static void __local_list_add_pending(struct bpf_lru *lru, *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + WRITE_ONCE(node->pending_free, 0); bpf_lru_node_clear_ref(node); - list_add(&node->list, local_pending_list(loc_l)); + list_add(&node->list, &loc_l->pending_list); } static struct bpf_lru_node * __local_list_pop_free(struct bpf_lru_locallist *loc_l) { - struct bpf_lru_node *node; + struct llist_node *llnode; - node = list_first_entry_or_null(local_free_list(loc_l), - struct bpf_lru_node, - list); - if (node) - list_del(&node->list); + llnode = llist_del_first(&loc_l->free_llist); + if (!llnode) + return NULL; - return node; + return container_of(llnode, struct bpf_lru_node, llist); } static struct bpf_lru_node * @@ -376,10 +383,10 @@ __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) ignore_ref: /* Get from the tail (i.e. older element) of the pending list. 
*/ - list_for_each_entry_reverse(node, local_pending_list(loc_l), - list) { + list_for_each_entry_reverse(node, &loc_l->pending_list, list) { if ((!bpf_lru_node_is_ref(node) || force) && - lru->del_from_htab(lru->del_arg, node)) { + (READ_ONCE(node->pending_free) || + lru->del_from_htab(lru->del_arg, node))) { list_del(&node->list); return node; } @@ -404,7 +411,8 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, l = per_cpu_ptr(lru->percpu_lru, cpu); - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) + return NULL; __bpf_lru_list_rotate(lru, l); @@ -420,7 +428,7 @@ static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); return node; } @@ -437,7 +445,8 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, loc_l = per_cpu_ptr(clru->local_list, cpu); - raw_spin_lock_irqsave(&loc_l->lock, flags); + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) + return NULL; node = __local_list_pop_free(loc_l); if (!node) { @@ -448,17 +457,22 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, if (node) __local_list_add_pending(lru, loc_l, cpu, node, hash); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); if (node) return node; - /* No free nodes found from the local free list and + /* + * No free nodes found from the local free list and * the global LRU list. * * Steal from the local free/pending list of the * current CPU and remote CPU in RR. It starts * with the loc_l->next_steal CPU. + * + * Acquire the victim's lock before touching either list. On + * acquisition failure (rqspinlock AA or timeout) skip the victim + * and try the next CPU. 
*/ first_steal = loc_l->next_steal; @@ -466,24 +480,36 @@ static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, do { steal_loc_l = per_cpu_ptr(clru->local_list, steal); - raw_spin_lock_irqsave(&steal_loc_l->lock, flags); - - node = __local_list_pop_free(steal_loc_l); - if (!node) - node = __local_list_pop_pending(lru, steal_loc_l); - - raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + if (!raw_res_spin_lock_irqsave(&steal_loc_l->lock, flags)) { + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + raw_res_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + } steal = cpumask_next_wrap(steal, cpu_possible_mask); } while (!node && steal != first_steal); loc_l->next_steal = steal; - if (node) { - raw_spin_lock_irqsave(&loc_l->lock, flags); - __local_list_add_pending(lru, loc_l, cpu, node, hash); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + if (!node) + return NULL; + + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) { + /* + * The local pending lock can't be acquired (rqspinlock AA + * or timeout). Return the stolen node to the per-CPU + * free_llist instead of orphaning it; the next pop_free on + * this CPU will pick it up. 
+ */ + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + bpf_lru_node_clear_ref(node); + WRITE_ONCE(node->pending_free, 0); + llist_add(&node->llist, &loc_l->free_llist); + return NULL; } + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); return node; } @@ -511,18 +537,24 @@ static void bpf_common_lru_push_free(struct bpf_lru *lru, loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); - raw_spin_lock_irqsave(&loc_l->lock, flags); + if (raw_res_spin_lock_irqsave(&loc_l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + raw_res_spin_unlock_irqrestore(&loc_l->lock, + flags); goto check_lru_list; } node->type = BPF_LRU_LOCAL_LIST_T_FREE; bpf_lru_node_clear_ref(node); - list_move(&node->list, local_free_list(loc_l)); + list_del(&node->list); + + raw_res_spin_unlock_irqrestore(&loc_l->lock, flags); - raw_spin_unlock_irqrestore(&loc_l->lock, flags); + llist_add(&node->llist, &loc_l->free_llist); return; } @@ -538,11 +570,14 @@ static void bpf_percpu_lru_push_free(struct bpf_lru *lru, l = per_cpu_ptr(lru->percpu_lru, node->cpu); - raw_spin_lock_irqsave(&l->lock, flags); + if (raw_res_spin_lock_irqsave(&l->lock, flags)) { + WRITE_ONCE(node->pending_free, 1); + return; + } __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); - raw_spin_unlock_irqrestore(&l->lock, flags); + raw_res_spin_unlock_irqrestore(&l->lock, flags); } void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) @@ -565,6 +600,7 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; + node->pending_free = 0; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; @@ -594,6 +630,7 @@ again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; 
node->type = BPF_LRU_LIST_T_FREE; + node->pending_free = 0; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; @@ -618,14 +655,12 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { - int i; - - for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) - INIT_LIST_HEAD(&loc_l->lists[i]); + INIT_LIST_HEAD(&loc_l->pending_list); + init_llist_head(&loc_l->free_llist); loc_l->next_steal = cpu; - raw_spin_lock_init(&loc_l->lock); + raw_res_spin_lock_init(&loc_l->lock); } static void bpf_lru_list_init(struct bpf_lru_list *l) @@ -640,7 +675,7 @@ static void bpf_lru_list_init(struct bpf_lru_list *l) l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - raw_spin_lock_init(&l->lock); + raw_res_spin_lock_init(&l->lock); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index fe2661a58ea9..8d0ee61622af 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -6,11 +6,11 @@ #include <linux/cache.h> #include <linux/list.h> -#include <linux/spinlock_types.h> +#include <linux/llist.h> +#include <asm/rqspinlock.h> #define NR_BPF_LRU_LIST_T (3) #define NR_BPF_LRU_LIST_COUNT (2) -#define NR_BPF_LRU_LOCAL_LIST_T (2) #define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T enum bpf_lru_list_type { @@ -22,10 +22,22 @@ enum bpf_lru_list_type { }; struct bpf_lru_node { - struct list_head list; + /* + * A node is in at most one list at a time. The free path on the + * per-CPU locallist uses an llist, so share storage via a union. + */ + union { + struct list_head list; + struct llist_node llist; + }; u16 cpu; u8 type; u8 ref; + /* + * Marks nodes whose *_push_free() lock acquire failed; reclaimed + * by flush/shrink which honor the flag instead of del_from_htab(). 
+ */ + u8 pending_free; }; struct bpf_lru_list { @@ -34,13 +46,14 @@ struct bpf_lru_list { /* The next inactive list rotation starts from here */ struct list_head *next_inactive_rotation; - raw_spinlock_t lock ____cacheline_aligned_in_smp; + rqspinlock_t lock ____cacheline_aligned_in_smp; }; struct bpf_lru_locallist { - struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + struct list_head pending_list; + struct llist_head free_llist; u16 next_steal; - raw_spinlock_t lock; + rqspinlock_t lock; }; struct bpf_common_lru { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index c5c925f00202..564071a92d7d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -427,6 +427,26 @@ BTF_ID(func, bpf_lsm_audit_rule_known) BTF_ID(func, bpf_lsm_inode_xattr_skipcap) BTF_SET_END(bool_lsm_hooks) +/* hooks returning void */ +#define LSM_HOOK_void(DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +#define LSM_HOOK_int(DEFAULT, NAME, ...) /* nothing */ +#define LSM_HOOK(RET, DEFAULT, NAME, ...) 
LSM_HOOK_##RET(DEFAULT, NAME, __VA_ARGS__) +BTF_SET_START(void_lsm_hooks) +#include <linux/lsm_hook_defs.h> +#undef LSM_HOOK +#undef LSM_HOOK_void +#undef LSM_HOOK_int +BTF_SET_END(void_lsm_hooks) + +bool bpf_lsm_hook_returns_errno(u32 btf_id) +{ + if (btf_id_set_contains(&bool_lsm_hooks, btf_id)) + return false; + if (btf_id_set_contains(&void_lsm_hooks, btf_id)) + return false; + return true; +} + int bpf_lsm_get_retval_range(const struct bpf_prog *prog, struct bpf_retval_range *retval_range) { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 521cb9d7e8c7..51b16e5f5534 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -594,8 +594,8 @@ const struct bpf_link_ops bpf_struct_ops_link_lops = { .dealloc = bpf_struct_ops_link_dealloc, }; -int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, - struct bpf_tramp_link *link, +int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_nodes *tnodes, + struct bpf_tramp_node *node, const struct btf_func_model *model, void *stub_func, void **_image, u32 *_image_off, @@ -605,13 +605,13 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, void *image = *_image; int size; - tlinks[BPF_TRAMP_FENTRY].links[0] = link; - tlinks[BPF_TRAMP_FENTRY].nr_links = 1; + tnodes[BPF_TRAMP_FENTRY].nodes[0] = node; + tnodes[BPF_TRAMP_FENTRY].nr_nodes = 1; if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; - size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); + size = arch_bpf_trampoline_size(model, flags, tnodes, stub_func); if (size <= 0) return size ? 
: -EFAULT; @@ -628,7 +628,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, size = arch_prepare_bpf_trampoline(NULL, image + image_off, image + image_off + size, - model, flags, tlinks, stub_func); + model, flags, tnodes, stub_func); if (size <= 0) { if (image != *_image) bpf_struct_ops_image_free(image); @@ -693,7 +693,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *module_type; const struct btf_member *member; const struct btf_type *t = st_ops_desc->type; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_nodes *tnodes; void *udata, *kdata; int prog_fd, err; u32 i, trampoline_start, image_off = 0; @@ -720,8 +720,8 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (uvalue->common.state || refcount_read(&uvalue->common.refcnt)) return -EINVAL; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; @@ -817,8 +817,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, err = -ENOMEM; goto reset_unlock; } - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, - &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); + bpf_tramp_link_init(link, BPF_LINK_TYPE_STRUCT_OPS, + &bpf_struct_ops_link_lops, prog, prog->expected_attach_type, 0); + *plink++ = &link->link; /* Poison pointer on error instead of return for backward compatibility */ @@ -832,7 +833,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, *pksym++ = ksym; trampoline_start = image_off; - err = bpf_struct_ops_prepare_trampoline(tlinks, link, + err = bpf_struct_ops_prepare_trampoline(tnodes, &link->node, &st_ops->func_models[i], *(void **)(st_ops->cfi_stubs + moff), &image, &image_off, @@ -911,7 +912,7 @@ reset_unlock: memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: - 
kfree(tlinks); + kfree(tnodes); mutex_unlock(&st_map->lock); if (!err) bpf_struct_ops_map_add_ksyms(st_map); @@ -1204,6 +1205,42 @@ u32 bpf_struct_ops_id(const void *kdata) } EXPORT_SYMBOL_GPL(bpf_struct_ops_id); +/** + * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog + * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg) + * @cb: callback invoked once per member prog; non-zero return stops iteration + * @data: opaque argument passed to @cb + * + * Walks the struct_ops member progs registered on the map containing @kdata. + * Intended for use from struct_ops ->reg() callbacks (and similar) that need to + * inspect the loaded BPF programs (for example to discover maps they reference + * via @prog->aux->used_maps). + * + * Return 0 if iteration completed, otherwise the first non-zero @cb return. + */ +int bpf_struct_ops_for_each_prog(const void *kdata, + int (*cb)(struct bpf_prog *prog, void *data), + void *data) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + u32 i; + int ret; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + for (i = 0; i < st_map->funcs_cnt; i++) { + if (!st_map->links[i]) + continue; + ret = cb(st_map->links[i]->prog, data); + if (ret) + return ret; + } + return 0; +} +EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog); + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a62d78581207..15ae7c43f594 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -182,7 +182,6 @@ #define BITS_ROUNDUP_BYTES(bits) \ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) -#define BTF_INFO_MASK 0x9f00ffff #define BTF_INT_MASK 0x0fffffff #define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) #define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= 
BTF_MAX_NAME_OFFSET) @@ -289,7 +288,7 @@ enum verifier_phase { struct resolve_vertex { const struct btf_type *t; u32 type_id; - u16 next_member; + u32 next_member; }; enum visit_state { @@ -2031,7 +2030,7 @@ static int env_stack_push(struct btf_verifier_env *env, } static void env_stack_set_next_member(struct btf_verifier_env *env, - u16 next_member) + u32 next_member) { env->stack[env->top_stack - 1].next_member = next_member; } @@ -3293,7 +3292,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, struct btf *btf = env->btf; u32 struct_size = t->size; u32 offset; - u16 i; + u32 i; meta_needed = btf_type_vlen(t) * sizeof(*member); if (meta_left < meta_needed) { @@ -3369,7 +3368,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env, { const struct btf_member *member; int err; - u16 i; + u32 i; /* Before continue resolving the next_member, * ensure the last member is indeed resolved to a @@ -3668,7 +3667,7 @@ end: static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { - u32 i, j; + u32 i, j, total_cnt, total_repeats; u32 cur; /* Ensure not repeating fields that should not be repeated. */ @@ -3686,10 +3685,9 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, } } - /* The type of struct size or variable size is u32, - * so the multiplication will not overflow. 
- */ - if (field_cnt * (repeat_cnt + 1) > info_cnt) + if (check_add_overflow(repeat_cnt, 1, &total_repeats) || + check_mul_overflow(field_cnt, total_repeats, &total_cnt) || + total_cnt > (u32)info_cnt) return -E2BIG; cur = field_cnt; @@ -4447,7 +4445,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env, const struct btf_enum *enums = btf_type_enum(t); struct btf *btf = env->btf; const char *fmt_str; - u16 i, nr_enums; + u32 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); @@ -4555,7 +4553,7 @@ static s32 btf_enum64_check_meta(struct btf_verifier_env *env, const struct btf_enum64 *enums = btf_type_enum64(t); struct btf *btf = env->btf; const char *fmt_str; - u16 i, nr_enums; + u32 i, nr_enums; u32 meta_needed; nr_enums = btf_type_vlen(t); @@ -4683,7 +4681,7 @@ static void btf_func_proto_log(struct btf_verifier_env *env, const struct btf_type *t) { const struct btf_param *args = (const struct btf_param *)(t + 1); - u16 nr_args = btf_type_vlen(t), i; + u32 nr_args = btf_type_vlen(t), i; btf_verifier_log(env, "return=%u args=(", t->type); if (!nr_args) { @@ -4929,7 +4927,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env, { const struct btf_var_secinfo *vsi; struct btf *btf = env->btf; - u16 i; + u32 i; env->resolve_mode = RESOLVE_TBD; for_each_vsi_from(i, v->next_member, v->t, vsi) { @@ -5183,7 +5181,7 @@ static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *ret_type; const struct btf_param *args; const struct btf *btf; - u16 nr_args, i; + u32 nr_args, i; int err; btf = env->btf; @@ -5278,7 +5276,7 @@ static int btf_func_check(struct btf_verifier_env *env, const struct btf_type *proto_type; const struct btf_param *args; const struct btf *btf; - u16 nr_args, i; + u32 nr_args, i; btf = env->btf; proto_type = btf_type_by_id(btf, t->type); @@ -5336,12 +5334,6 @@ static s32 btf_check_meta(struct btf_verifier_env *env, } meta_left -= sizeof(*t); - if (t->info & ~BTF_INFO_MASK) { - btf_verifier_log(env, "[%u] 
Invalid btf_info:%x", - env->log_type_id, t->info); - return -EINVAL; - } - if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { btf_verifier_log(env, "[%u] Invalid kind:%u", @@ -5914,25 +5906,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env, return 0; } -static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) -{ - u32 log_true_size; - int err; - - err = bpf_vlog_finalize(log, &log_true_size); - - if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && - copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), - &log_true_size, sizeof(log_true_size))) - err = -EFAULT; - - return err; -} - -static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log) { bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); - char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct btf *btf = NULL; @@ -5949,8 +5926,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ - err = bpf_vlog_init(&env->log, attr->btf_log_level, - log_ubuf, attr->btf_log_size); + err = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size); if (err) goto errout_free; @@ -6015,7 +5991,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat } } - err = finalize_log(&env->log, uattr, uattr_size); + err = bpf_log_attr_finalize(attr_log, &env->log); if (err) goto errout_free; @@ -6027,7 +6003,7 @@ errout_meta: btf_free_struct_meta_tab(btf); errout: /* overwrite err with -ENOSPC or -EFAULT */ - ret = finalize_log(&env->log, uattr, uattr_size); + ret = bpf_log_attr_finalize(attr_log, 
&env->log); if (ret) err = ret; errout_free: @@ -6980,7 +6956,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; - info->ref_obj_id = ctx_arg_info->ref_obj_id; + info->ref_id = ctx_arg_info->ref_id; return true; } } @@ -7825,6 +7801,134 @@ enum btf_arg_tag { ARG_TAG_ARENA = BIT_ULL(5), }; +static int btf_scan_decl_tags(struct bpf_verifier_env *env, + const struct btf *btf, + const struct btf_type *fn_t, + u32 arg_idx, bool is_global, u32 *tags) +{ + int id = btf_named_start_id(btf, false) - 1; + const char tag_key[] = "arg:"; + static const struct { + const char *tag_value; + enum btf_arg_tag arg_tag; + } tag_values[] = { + { "ctx", ARG_TAG_CTX }, + { "trusted", ARG_TAG_TRUSTED }, + { "untrusted", ARG_TAG_UNTRUSTED }, + { "nonnull", ARG_TAG_NONNULL }, + { "nullable", ARG_TAG_NULLABLE }, + { "arena", ARG_TAG_ARENA }, + }; + + /* + * The 'arg:<tag>' decl_tag takes precedence over the derivation + * of the register type from the BTF type itself. 
+ */ + while ((id = btf_find_next_decl_tag(btf, fn_t, arg_idx, tag_key, id)) > 0) { + const struct btf_type *tag_t; + const char *tag; + int i; + bool found; + + /* disallow arg tags in static subprogs */ + if (!is_global) { + bpf_log(&env->log, + "arg#%d type tag is not supported in static functions\n", + arg_idx); + return -EOPNOTSUPP; + } + + tag_t = btf_type_by_id(btf, id); + tag = __btf_name_by_offset(btf, tag_t->name_off) + (sizeof(tag_key) - 1); + + found = false; + for (i = 0; i < ARRAY_SIZE(tag_values); ++i) { + if (!strcmp(tag, tag_values[i].tag_value)) { + *tags |= tag_values[i].arg_tag; + found = true; + break; + } + } + + if (!found) { + bpf_log(&env->log, "arg#%d has unsupported set of tags\n", arg_idx); + return -EOPNOTSUPP; + } + } + if (id != -ENOENT) { + bpf_log(&env->log, "arg#%d type tag fetching failure: %d\n", arg_idx, id); + return id; + } + + return 0; +} + +static int btf_scan_type_tags(struct bpf_verifier_env *env, + const struct btf *btf, u32 type_id, + u32 *tags) +{ + const struct btf_type *t; + + /* Find the first pointer type in the chain. */ + t = btf_type_skip_modifiers(btf, type_id, NULL); + + /* + * We currently reject type tags on non-pointer types, + * which neither LLVM nor GCC support anyway. + */ + if (!t || !btf_type_is_ptr(t)) + return 0; + + /* We got a pointer, get all associated type tags. */ + for (t = btf_type_by_id(btf, t->type); t && btf_type_is_modifier(t); + t = btf_type_by_id(btf, t->type)) { + + /* Skip non-type tag modifiers. */ + if (!btf_type_is_type_tag(t)) + continue; + + const char *tag = __btf_name_by_offset(btf, t->name_off); + + if (strcmp(tag, "arena") == 0) { + *tags |= ARG_TAG_ARENA; + } else { + bpf_log(&env->log, "function signature member has unsupported type tag '%s'\n", + tag); + return -EOPNOTSUPP; + } + } + + return 0; +} + +/* Check whether the type is a valid return type. 
*/ +static int btf_validate_return_type(struct bpf_verifier_env *env, struct btf *btf, + const struct btf_type *t, int subprog) +{ + u32 tags = 0; + int err; + + err = btf_scan_type_tags(env, btf, t->type, &tags); + if (err) + return err; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + /* + * We allow all subprogs except for the main one to return any kind of arena pointer. + * General arena variables are not allowed, since it makes no sense to return by value + * a variable that's on the heap in the first place. + */ + if (subprog && (tags & ARG_TAG_ARENA) && btf_type_is_ptr(t)) + return 0; + + /* We always accept void or scalars. */ + if (btf_type_is_void(t) || btf_type_is_int(t) || btf_is_any_enum(t)) + return 0; + + return -EOPNOTSUPP; +} + /* Process BTF of a function to produce high-level expectation of function * arguments (like ARG_PTR_TO_CTX, or ARG_PTR_TO_MEM, etc). This information * is cached in subprog info for reuse. @@ -7843,6 +7947,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) struct btf *btf = prog->aux->btf; const struct btf_param *args; const struct btf_type *t, *ref_t, *fn_t; + int err; u32 i, nargs, btf_id; const char *tname; @@ -7887,25 +7992,36 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); + sub->arg_cnt = nargs; + if (nargs > MAX_BPF_FUNC_ARGS) { + bpf_log(log, "kernel supports at most %d parameters, function %s has %d\n", + MAX_BPF_FUNC_ARGS, tname, nargs); + return -EFAULT; + } if (nargs > MAX_BPF_FUNC_REG_ARGS) { - if (!is_global) - return -EINVAL; - bpf_log(log, "Global function %s() with %d > %d args. 
Buggy compiler.\n", + if (!bpf_jit_supports_stack_args()) { + bpf_log(log, "JIT does not support function %s() with %d args\n", + tname, nargs); + return -EFAULT; + } + sub->stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + } + + if (is_global && nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "global function %s has %d > %d args, stack args not supported\n", tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } - /* check that function is void or returns int, exception cb also requires this */ - t = btf_type_by_id(btf, t->type); - while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf, t->type); - if (!btf_type_is_void(t) && !btf_type_is_int(t) && !btf_is_any_enum(t)) { - if (!is_global) - return -EINVAL; - bpf_log(log, - "Global function %s() return value not void or scalar. " - "Only those are supported.\n", - tname); - return -EINVAL; + + err = btf_validate_return_type(env, btf, t, subprog); + if (err) { + if (is_global) { + bpf_log(log, + "Global function %s() return value not void or scalar. " + "Only those are supported.\n", + tname); + } + return err; } /* Convert BTF function arguments into verifier types. 
@@ -7913,42 +8029,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) */ for (i = 0; i < nargs; i++) { u32 tags = 0; - int id = btf_named_start_id(btf, false) - 1; - - /* 'arg:<tag>' decl_tag takes precedence over derivation of - * register type from BTF type itself - */ - while ((id = btf_find_next_decl_tag(btf, fn_t, i, "arg:", id)) > 0) { - const struct btf_type *tag_t = btf_type_by_id(btf, id); - const char *tag = __btf_name_by_offset(btf, tag_t->name_off) + 4; - - /* disallow arg tags in static subprogs */ - if (!is_global) { - bpf_log(log, "arg#%d type tag is not supported in static functions\n", i); - return -EOPNOTSUPP; - } + err = btf_scan_decl_tags(env, btf, fn_t, i, is_global, &tags); + if (err) + return err; - if (strcmp(tag, "ctx") == 0) { - tags |= ARG_TAG_CTX; - } else if (strcmp(tag, "trusted") == 0) { - tags |= ARG_TAG_TRUSTED; - } else if (strcmp(tag, "untrusted") == 0) { - tags |= ARG_TAG_UNTRUSTED; - } else if (strcmp(tag, "nonnull") == 0) { - tags |= ARG_TAG_NONNULL; - } else if (strcmp(tag, "nullable") == 0) { - tags |= ARG_TAG_NULLABLE; - } else if (strcmp(tag, "arena") == 0) { - tags |= ARG_TAG_ARENA; - } else { - bpf_log(log, "arg#%d has unsupported set of tags\n", i); - return -EOPNOTSUPP; - } - } - if (id != -ENOENT) { - bpf_log(log, "arg#%d type tag fetching failure: %d\n", i, id); - return id; - } + err = btf_scan_type_tags(env, btf, args[i].type, &tags); + if (err) + return err; t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) @@ -7973,7 +8060,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) bpf_log(log, "arg#%d has invalid combination of tags\n", i); return -EINVAL; } - sub->args[i].arg_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY; + sub->args[i].arg_type = ARG_PTR_TO_DYNPTR; continue; } if (tags & ARG_TAG_TRUSTED) { @@ -8074,7 +8161,6 @@ skip_pointer: return -EINVAL; } - sub->arg_cnt = nargs; sub->args_cached = true; return 0; @@ -8196,12 +8282,12 @@ static int 
__btf_new_fd(struct btf *btf) return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { struct btf *btf; int ret; - btf = btf_parse(attr, uattr, uattr_size); + btf = btf_parse(attr, uattr, attr_log); if (IS_ERR(btf)) return PTR_ERR(btf); @@ -8684,6 +8770,39 @@ static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, return 0; } +static int btf_check_kfunc_name(struct btf *btf, const char *func_name, u32 kind) +{ +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; +#endif + s32 id; + + if (!btf_is_module(btf)) + return 0; + + id = btf_find_by_name_kind(bpf_get_btf_vmlinux(), func_name, kind); + if (id >= 0) { + pr_err("kfunc %s (id: %d) is already present in vmlinux.\n", + func_name, id); + return -EINVAL; + } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + guard(mutex)(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->btf == btf) + continue; + id = btf_find_by_name_kind(btf_mod->btf, func_name, kind); + if (id >= 0) { + pr_err("kfunc %s (id: %d) is already present in module %s.\n", + func_name, id, btf_mod->module->name); + return -EINVAL; + } + } +#endif + return 0; +} + static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) { const struct btf_type *func; @@ -8697,7 +8816,8 @@ static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) /* sanity check kfunc name */ func_name = btf_name_by_offset(btf, func->name_off); - if (!func_name || !func_name[0]) + if (!func_name || !func_name[0] || + btf_check_kfunc_name(btf, func_name, BTF_INFO_KIND(func->info))) return -EINVAL; func = btf_type_by_id(btf, func->type); diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c index 998f42a8189a..26d37066465f 100644 --- a/kernel/bpf/cfg.c +++ b/kernel/bpf/cfg.c @@ -64,11 +64,19 @@ 
static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) subprog->might_sleep = true; } +static void mark_subprog_might_throw(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = bpf_find_containing_subprog(env, off); + subprog->might_throw = true; +} + /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. * Rely on DFS traversal order and absence of recursive calls to guarantee that - * callee's change_pkt_data marks would be correct at that moment. + * callee's effect marks would be correct at that moment. */ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) { @@ -78,6 +86,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) callee = bpf_find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; caller->might_sleep |= callee->might_sleep; + caller->might_throw |= callee->might_throw; } enum { @@ -509,6 +518,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_subprog_might_sleep(env, t); if (ret == 0 && bpf_is_kfunc_pkt_changing(&meta)) mark_subprog_changes_pkt_data(env, t); + if (ret == 0 && bpf_is_throw_kfunc(insn)) + mark_subprog_might_throw(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 876f6a81a9b6..83ce66296ac1 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -55,6 +55,28 @@ void __init cgroup_bpf_lifetime_notifier_init(void) &cgroup_bpf_lifetime_nb)); } +#ifdef CONFIG_BPF_LSM +struct cgroup_lsm_atype { + u32 attach_btf_id; + int refcnt; + bool returns_errno; +}; + +static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; + +static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) +{ + if (atype >= CGROUP_LSM_START && atype <= CGROUP_LSM_END) + return 
READ_ONCE(cgroup_lsm_atype[atype - CGROUP_LSM_START].returns_errno); + return true; +} +#else +static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) +{ + return true; +} +#endif + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -83,7 +105,8 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, *(ret_flags) |= (func_ret >> 1); func_ret &= 1; } - if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval)) + if (!func_ret && cgroup_bpf_hook_returns_errno(atype) && + !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } @@ -156,13 +179,6 @@ unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, } #ifdef CONFIG_BPF_LSM -struct cgroup_lsm_atype { - u32 attach_btf_id; - int refcnt; -}; - -static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; - static enum cgroup_bpf_attach_type bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) { @@ -191,10 +207,13 @@ void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) lockdep_assert_held(&cgroup_mutex); - WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id && - cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); - - cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + if (!cgroup_lsm_atype[i].attach_btf_id) { + cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; + WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, + bpf_lsm_hook_returns_errno(attach_btf_id)); + } else { + WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); + } cgroup_lsm_atype[i].refcnt++; } @@ -203,8 +222,10 @@ void bpf_cgroup_atype_put(int cgroup_atype) int i = cgroup_atype - CGROUP_LSM_START; cgroup_lock(); - if (--cgroup_lsm_atype[i].refcnt <= 0) + if (--cgroup_lsm_atype[i].refcnt <= 0) { + WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, true); cgroup_lsm_atype[i].attach_btf_id = 0; + } WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); cgroup_unlock(); } @@ -1208,7 +1229,7 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct 
bpf_prog *prog, /* Must be called with cgroup_mutex held to avoid races. */ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; @@ -1259,7 +1280,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return -EFAULT; if (!effective_query && from_atype == to_atype) revision = cgrp->bpf.revisions[from_atype]; - if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + if (uattr_size >= offsetofend(union bpf_attr, query.revision) && + copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) return -EFAULT; if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ @@ -1312,12 +1334,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { int ret; cgroup_lock(); - ret = __cgroup_bpf_query(cgrp, attr, uattr); + ret = __cgroup_bpf_query(cgrp, attr, uattr, uattr_size); cgroup_unlock(); return ret; } @@ -1520,7 +1542,7 @@ out_put_cgroup: } int cgroup_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { struct cgroup *cgrp; int ret; @@ -1529,7 +1551,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, if (IS_ERR(cgrp)) return PTR_ERR(cgrp); - ret = cgroup_bpf_query(cgrp, attr, uattr); + ret = cgroup_bpf_query(cgrp, attr, uattr, uattr_size); cgroup_put(cgrp); return ret; @@ -1935,8 +1957,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); - if (ret == 1 && ctx.new_updated) { - kfree(*buf); + if (!ret && 
ctx.new_updated) { + kvfree(*buf); *buf = ctx.new_val; *pcount = ctx.new_len; } else { @@ -2342,6 +2364,7 @@ BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, return -E2BIG; memcpy(ctx->new_val, buf, buf_len); + ((char *)ctx->new_val)[buf_len] = '\0'; ctx->new_len = buf_len; ctx->new_updated = 1; diff --git a/kernel/bpf/cnum.c b/kernel/bpf/cnum.c new file mode 100644 index 000000000000..86142cb2aee5 --- /dev/null +++ b/kernel/bpf/cnum.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#include <linux/bits.h> + +#define T 32 +#include "cnum_defs.h" +#undef T + +#define T 64 +#include "cnum_defs.h" +#undef T + +struct cnum32 cnum32_from_cnum64(struct cnum64 cnum) +{ + if (cnum64_is_empty(cnum)) + return CNUM32_EMPTY; + + if (cnum.size >= U32_MAX) + return (struct cnum32){ .base = 0, .size = U32_MAX }; + else + return (struct cnum32){ .base = (u32)cnum.base, .size = cnum.size }; +} + +/* + * Suppose 'a' and 'b' are laid out as follows: + * + * 64-bit number axis ---> + * + * N*2^32 (N+1)*2^32 (N+2)*2^32 (N+3)*2^32 + * ||------|---|=====|-------||----------|=====|-------||----------|=====|----|--|| + * | |< b >| |< b >| |< b >| | + * | | | | + * |<--+--------------------------- a ---------------------------+--->| + * | | + * |<-------------------------- t -------------------------->| + * + * In such a case it is possible to infer a more tight representation t + * such that ∀ v ∈ a, (u32)v ∈ b: v ∈ t. + */ +struct cnum64 cnum64_cnum32_intersect(struct cnum64 a, struct cnum32 b) +{ + /* + * To simplify reasoning, rotate the circles so that [virtual] a1 starts + * at u32 boundary, b1 represents b in this new frame of reference. 
+ */ + struct cnum32 b1 = { b.base - (u32)a.base, b.size }; + struct cnum64 t = a; + u64 d, b1_max; + + if (cnum64_is_empty(a) || cnum32_is_empty(b)) + return CNUM64_EMPTY; + + if (cnum32_urange_overflow(b1)) { + b1_max = (u32)b1.base + (u32)b1.size; /* overflow here is fine and necessary */ + if ((u32)a.size > b1_max && (u32)a.size < b1.base) { + /* + * N*2^32 (N+1)*2^32 + * ||=====|------------|=====||=====|---------|---|=====|| + * |b1 ->| |<- b1||b1 ->| | |<- b1| + * |<----------------- a1 ------------------>| + * |<-------------- t ------------>|<-- d -->| (after adjustment) + * ^ + * b1_max + */ + d = (u32)a.size - b1_max; + t.size -= d; + } else { + /* + * No adjustments possible in the following cases: + * + * ||=====|------------|=====||===|=|-------------|=|===|| + * |b1 ->| |<- b1||b1 +>| |<+ b1| + * |<----------------- a1 ------>| | + * |<----------------- (or) a1 ------------------->| + */ + } + } else { + if (t.size < b1.base) + /* + * N*2^32 (N+1)*2^32 + * ||----------|--|=======|--||------> + * |<-- a1 -->| |<- b ->| + */ + return CNUM64_EMPTY; + /* + * N*2^32 (N+1)*2^32 + * ||-------------|========|-||-----| -------|========|-|| + * | |<- b1 ->| | |<- b1 ->| + * |<------------+ a1 ------------>| + * |<------ t ------>| (after adjustment) + */ + t.base += b1.base; + t.size -= b1.base; + b1_max = b1.base + b1.size; + d = 0; + if ((u32)a.size < b1.base) + /* + * N*2^32 (N+1)*2^32 + * ||-------------|========|-||------|-------|========|-|| + * | |<- b1 ->| | |<- b1 ->| + * |<------------+-- a1 --+-------->| + * |<- t ->|<-- d -->| (after adjustment) + */ + d = (u32)a.size + (BIT_ULL(32) - b1_max); + else if ((u32)a.size >= b1_max) + /* + * N*2^32 (N+1)*2^32 + * ||--|========|------------||--|========|-------|-----|| + * | |<- b1 ->| |<- b1 ->| | + * |<-+------------------ a1 ------------+------>| + * |<-------------- t --------------->|<- d ->| (after adjustment) + */ + d = (u32)a.size - b1_max; + if (t.size < d) + return CNUM64_EMPTY; + t.size -= d; + 
} + return t; +} diff --git a/kernel/bpf/cnum_defs.h b/kernel/bpf/cnum_defs.h new file mode 100644 index 000000000000..a90e317e3578 --- /dev/null +++ b/kernel/bpf/cnum_defs.h @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ + +#ifndef T +#error "Define T (bit width: 32, 64) before including cnum_defs.h" +#endif + +#include <linux/cnum.h> +#include <linux/kernel.h> +#include <linux/limits.h> +#include <linux/minmax.h> +#include <linux/compiler_types.h> + +#define cnum_t __PASTE(cnum, T) +#define ut __PASTE(u, T) +#define st __PASTE(s, T) +#define UT_MAX __PASTE(__PASTE(U, T), _MAX) +#define ST_MAX __PASTE(__PASTE(S, T), _MAX) +#define ST_MIN __PASTE(__PASTE(S, T), _MIN) +#define EMPTY __PASTE(__PASTE(CNUM, T), _EMPTY) +#define FN(name) __PASTE(__PASTE(cnum, T), __PASTE(_, name)) + +struct cnum_t FN(from_urange)(ut min, ut max) +{ + return (struct cnum_t){ .base = min, .size = (ut)max - min }; +} + +struct cnum_t FN(from_srange)(st min, st max) +{ + ut size = (ut)max - (ut)min; + ut base = size == UT_MAX ? 0 : (ut)min; + + return (struct cnum_t){ .base = base, .size = size }; +} + +/* True if this cnum represents two unsigned ranges. */ +static inline bool FN(urange_overflow)(struct cnum_t cnum) +{ + /* Same as cnum.base + cnum.size > UT_MAX but avoids overflow */ + return cnum.size > UT_MAX - (ut)cnum.base; +} + +/* + * cnum{T}_umin / cnum{T}_umax query an unsigned range represented by this cnum. + * If cnum represents a range crossing the UT_MAX/0 boundary, the unbound range + * [0..UT_MAX] is returned. + */ +ut FN(umin)(struct cnum_t cnum) +{ + return FN(urange_overflow)(cnum) ? 0 : cnum.base; +} +EXPORT_SYMBOL_GPL(FN(umin)); + +ut FN(umax)(struct cnum_t cnum) +{ + return FN(urange_overflow)(cnum) ? UT_MAX : cnum.base + cnum.size; +} +EXPORT_SYMBOL_GPL(FN(umax)); + +/* True if this cnum represents two signed ranges. 
*/ +static inline bool FN(srange_overflow)(struct cnum_t cnum) +{ + return FN(contains)(cnum, (ut)ST_MAX) && FN(contains)(cnum, (ut)ST_MIN); +} + +/* + * cnum{T}_smin / cnum{T}_smax query a signed range represented by this cnum. + * If cnum represents a range crossing the ST_MAX/ST_MIN boundary, the unbound range + * [ST_MIN..ST_MAX] is returned. + */ +st FN(smin)(struct cnum_t cnum) +{ + return FN(srange_overflow)(cnum) + ? ST_MIN + : min((st)cnum.base, (st)(cnum.base + cnum.size)); +} + +st FN(smax)(struct cnum_t cnum) +{ + return FN(srange_overflow)(cnum) + ? ST_MAX + : max((st)cnum.base, (st)(cnum.base + cnum.size)); +} + +/* + * Returns a possibly empty intersection of cnums 'a' and 'b'. + * If 'a' and 'b' intersect in two sub-arcs, the function over-approximates + * and returns either 'a' or 'b', whichever is smaller. + */ +struct cnum_t FN(intersect)(struct cnum_t a, struct cnum_t b) +{ + struct cnum_t b1; + ut dbase; + + if (FN(is_empty)(a) || FN(is_empty)(b)) + return EMPTY; + + if (a.base > b.base) + swap(a, b); + + /* + * Rotate frame of reference such that a.base is 0. + * 'b1' is 'b' in this frame of reference. + */ + dbase = b.base - a.base; + b1 = (struct cnum_t){ dbase, b.size }; + if (FN(urange_overflow)(b1)) { + if (b1.base <= a.size) { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------| + * [=== a ==========================] | + * [= b1 tail =] [========= b1 main ==========>] + * ^-- b1.base <= a.size + * + * 'a' and 'b' intersect in two disjoint arcs, + * can't represent as single cnum, over-approximate + * the result. + */ + return a.size <= b.size ? a : b; + } else { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------| + * [=== a =============] | | + * [= b1 tail =] [======= b1 main ====>] + * ^-- b1.base > a.size + * + * Only 'b' tail intersects 'a'. 
+ */ + return (struct cnum_t) { + .base = a.base, + .size = min(a.size, (ut)(b1.base + b1.size)), + }; + } + } else if (a.size >= b1.base) { + /* + * Rotated frame (a.base at origin): + * + * 0 UT_MAX + * |--------------------------------------------------| + * [=== a ==================================] | + * [== b1 =====================] + * + * 0 UT_MAX + * |--------------------------------------------------| + * [=== a ==================================] | + * [== b1 ====] + * ^-- b1.base <= a.size + * |<-- a.size - dbase -->| + * + * 'a' and 'b' intersect as one cnum. + */ + return (struct cnum_t) { + .base = b.base, + .size = min((ut)(a.size - dbase), b.size), + }; + } else { + return EMPTY; + } +} + +void FN(intersect_with)(struct cnum_t *dst, struct cnum_t src) +{ + *dst = FN(intersect)(*dst, src); +} + +void FN(intersect_with_urange)(struct cnum_t *dst, ut min, ut max) +{ + FN(intersect_with)(dst, FN(from_urange)(min, max)); +} + +void FN(intersect_with_srange)(struct cnum_t *dst, st min, st max) +{ + FN(intersect_with)(dst, FN(from_srange)(min, max)); +} + +static inline struct cnum_t FN(normalize)(struct cnum_t cnum) +{ + if (cnum.size == UT_MAX && cnum.base != 0 && cnum.base != (ut)ST_MAX) + cnum.base = 0; + return cnum; +} + +struct cnum_t FN(add)(struct cnum_t a, struct cnum_t b) +{ + if (FN(is_empty)(a) || FN(is_empty)(b)) + return EMPTY; + if (a.size > UT_MAX - b.size) + return (struct cnum_t){ 0, (ut)UT_MAX }; + else + return FN(normalize)((struct cnum_t){ a.base + b.base, a.size + b.size }); +} + +struct cnum_t FN(negate)(struct cnum_t a) +{ + if (FN(is_empty)(a)) + return EMPTY; + return FN(normalize)((struct cnum_t){ -((ut)a.base + a.size), a.size }); +} + +bool FN(is_empty)(struct cnum_t cnum) +{ + return cnum.base == EMPTY.base && cnum.size == EMPTY.size; +} + +bool FN(contains)(struct cnum_t cnum, ut v) +{ + if (FN(is_empty)(cnum)) + return false; + if (FN(urange_overflow)(cnum)) + return v >= cnum.base || v <= (ut)cnum.base + cnum.size; + 
else + return v >= cnum.base && v <= (ut)cnum.base + cnum.size; +} + +bool FN(is_const)(struct cnum_t cnum) +{ + return cnum.size == 0; +} + +bool FN(is_subset)(struct cnum_t bigger, struct cnum_t smaller) +{ + if (FN(is_empty(smaller))) + return true; + if (FN(is_empty(bigger))) + return false; + /* rotate both arcs such that 'bigger' starts at origin, hence does not overflow */ + smaller.base -= bigger.base; + bigger.base = 0; + if (FN(urange_overflow)(smaller) && bigger.size < UT_MAX) + return false; + return smaller.base + smaller.size <= bigger.size; +} + +#undef EMPTY +#undef cnum_t +#undef ut +#undef st +#undef UT_MAX +#undef ST_MAX +#undef ST_MIN +#undef FN diff --git a/kernel/bpf/const_fold.c b/kernel/bpf/const_fold.c index db73c4740b1e..b2a19acadb91 100644 --- a/kernel/bpf/const_fold.c +++ b/kernel/bpf/const_fold.c @@ -58,6 +58,14 @@ static void const_reg_xfer(struct bpf_verifier_env *env, struct const_arg_info * u8 opcode = BPF_OP(insn->code) | BPF_SRC(insn->code); int r; + /* Stack arg stores (r11-based) are outside the tracked register set. 
*/ + if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) + return; + if (is_stack_arg_ldx(insn)) { + ci_out[insn->dst_reg] = unknown; + return; + } + switch (class) { case BPF_ALU: case BPF_ALU64: diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b018ff48875..649cce41e13f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1299,8 +1299,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, u32 imm_rnd = get_random_u32(); s16 off; - BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); - BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(BPF_REG_PARAMS + 2 != MAX_BPF_JIT_REG); + BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); /* Constraints on AX register: * @@ -1582,6 +1582,16 @@ bool bpf_insn_is_indirect_target(const struct bpf_verifier_env *env, const struc insn_idx += prog->aux->subprog_start; return env->insn_aux_data[insn_idx].indirect_target; } + +u16 bpf_out_stack_arg_cnt(const struct bpf_verifier_env *env, const struct bpf_prog *prog) +{ + const struct bpf_subprog_info *sub; + + if (!env) + return 0; + sub = &env->subprog_info[prog->aux->func_idx]; + return sub->stack_arg_cnt - bpf_in_stack_arg_cnt(sub); +} #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, @@ -1771,6 +1781,9 @@ static u32 abs_s32(s32 x) return x >= 0 ? 
(u32)x : -(u32)x; } +static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn); + /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers @@ -2077,10 +2090,9 @@ select_insn: CONT; JMP_CALL_ARGS: - BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, - BPF_R3, BPF_R4, - BPF_R5, - insn + insn->off + 1); + BPF_R0 = interpreters_args[insn->off](BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5, + insn + insn->imm + 1); CONT; JMP_TAIL_CALL: { @@ -2394,13 +2406,22 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) #undef PROG_NAME_LIST #ifdef CONFIG_BPF_SYSCALL -void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) +int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); - insn->off = (s16) insn->imm; - insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - - __bpf_call_base_args; + /* Prevent out-of-bounds read to interpreters_args */ + if (stack_depth > MAX_BPF_STACK) + return -EINVAL; + insn->off = (round_up(stack_depth, 32) / 32) - 1; insn->code = BPF_JMP | BPF_CALL_ARGS; + return 0; +} + +s32 bpf_call_args_imm(s16 idx) +{ + if (WARN_ON_ONCE(idx < 0 || idx >= ARRAY_SIZE(interpreters_args))) + return 0; + return BPF_CALL_IMM(interpreters_args[idx]); } #endif #endif @@ -2460,7 +2481,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, cookie = aux->cgroup_storage[i] ? 
aux->cgroup_storage[i]->cookie : 0; ret = map->owner->storage_cookie[i] == cookie || - !cookie; + (!cookie && !aux->tail_call_reachable); } if (ret && map->owner->attach_func_proto != aux->attach_func_proto) { @@ -3217,6 +3238,11 @@ bool __weak bpf_jit_supports_kfunc_call(void) return false; } +bool __weak bpf_jit_supports_stack_args(void) +{ + return false; +} + bool __weak bpf_jit_supports_far_kfunc_call(void) { return false; @@ -3352,6 +3378,12 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) } #ifdef CONFIG_BPF_SYSCALL +__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, + unsigned long fault_ip) +{ + return false; +} + static int __init bpf_global_ma_init(void) { int ret; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index cc0a43ebab6b..dc7b859e8bbf 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -581,6 +581,10 @@ static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, { struct xdp_frame *nxdpf; + /* Frags live outside the linear frame and cannot be cloned safely. */ + if (unlikely(xdp_frame_has_frags(xdpf))) + return -EOPNOTSUPP; + nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; @@ -706,6 +710,18 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, if (unlikely(err)) return err; + if (dst->xdp_prog && skb_cloned(skb)) { + struct sk_buff *nskb; + + nskb = skb_copy(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb->mac_len = skb->mac_len; + consume_skb(skb); + skb = nskb; + } + /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. 
@@ -726,6 +742,9 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *nskb; int err; + if (unlikely(skb_is_nonlinear(skb))) + return -EOPNOTSUPP; + nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; diff --git a/kernel/bpf/fixups.c b/kernel/bpf/fixups.c index fba9e8c00878..3cf2cc6e3ab6 100644 --- a/kernel/bpf/fixups.c +++ b/kernel/bpf/fixups.c @@ -870,7 +870,7 @@ int bpf_convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike - * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot + * PTR_TO_BTF_ID, and an active referenced id, but the same cannot * be said once it is marked PTR_UNTRUSTED, hence we must handle * any faults for loads into such types. BPF_WRITE is disallowed * for this case. @@ -1250,9 +1250,9 @@ static int jit_subprogs(struct bpf_verifier_env *env) } if (!bpf_pseudo_call(insn)) continue; - insn->off = env->insn_aux_data[i].call_imm; - subprog = bpf_find_subprog(env, i + insn->off + 1); - insn->imm = subprog; + insn->imm = env->insn_aux_data[i].call_imm; + subprog = bpf_find_subprog(env, i + insn->imm + 1); + insn->off = subprog; } prog->jited = 1; @@ -1265,6 +1265,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->aux->real_func_cnt = env->subprog_cnt; prog->aux->bpf_exception_cb = (void *)func[env->exception_callback_subprog]->bpf_func; prog->aux->exception_boundary = func[0]->aux->exception_boundary; + prog->aux->stack_arg_sp_adjust = func[0]->aux->stack_arg_sp_adjust; bpf_prog_jit_attempt_done(prog); return 0; out_free: @@ -1378,9 +1379,21 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); - int i, depth; + int depth; #endif - int err = 0; + int i, err = 0; + + for (i = 0; i < env->subprog_cnt; i++) { + struct bpf_subprog_info *subprog = 
&env->subprog_info[i]; + u16 outgoing = subprog->stack_arg_cnt - bpf_in_stack_arg_cnt(subprog); + + if (subprog->max_out_stack_arg_cnt > outgoing) { + verbose(env, + "func#%d writes %u stack arg slots, but calls only require %u\n", + i, subprog->max_out_stack_arg_cnt, outgoing); + return -EINVAL; + } + } if (env->prog->jit_requested && !bpf_prog_is_offloaded(env->prog->aux)) { @@ -1395,6 +1408,12 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); return -EINVAL; } + for (i = 0; i < env->subprog_cnt; i++) { + if (bpf_in_stack_arg_cnt(&env->subprog_info[i])) { + verbose(env, "stack args are not supported in non-JITed programs\n"); + return -EINVAL; + } + } if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { /* When JIT fails the progs with bpf2bpf calls and tail_calls * have to be rejected, since interpreter doesn't support them yet. @@ -1416,7 +1435,12 @@ int bpf_fixup_call_args(struct bpf_verifier_env *env) depth = get_callee_stack_depth(env, insn, i); if (depth < 0) return depth; - bpf_patch_call_args(insn, depth); + err = bpf_patch_call_args(insn, depth); + if (err) { + verbose(env, "stack depth %d exceeds interpreter stack depth limit\n", + depth); + return err; + } } err = 0; #endif @@ -2162,6 +2186,8 @@ patch_map_ops_generic: insn->imm == BPF_FUNC_get_func_ret) { if (eatype == BPF_TRACE_FEXIT || eatype == BPF_TRACE_FSESSION || + eatype == BPF_TRACE_FEXIT_MULTI || + eatype == BPF_TRACE_FSESSION_MULTI || eatype == BPF_MODIFY_RETURN) { /* Load nr_args from ctx - 8 */ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3dd9b4924ae4..9f394e1aa2e8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include <linux/rculist_nulls.h> #include <linux/rcupdate_wait.h> #include <linux/random.h> +#include <linux/rhashtable.h> #include <uapi/linux/btf.h> #include 
<linux/rcupdate_trace.h> #include <linux/btf_ids.h> @@ -242,6 +243,10 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab) if (IS_ERR_OR_NULL(htab->map.record)) return; + /* + * Preallocated maps do not have a bpf_mem_alloc destructor, so fully + * destroy every element, including the extra elements. + */ if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); for (i = 0; i < num_entries; i++) { @@ -496,28 +501,26 @@ static void htab_dtor_ctx_free(void *ctx) kfree(ctx); } -static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *)) +static int bpf_ma_set_dtor(struct bpf_map *map, struct bpf_mem_alloc *ma, + void (*dtor)(void *, void *)) { - u32 key_size = htab->map.key_size; - struct bpf_mem_alloc *ma; struct htab_btf_record *hrec; int err; /* No need for dtors. */ - if (IS_ERR_OR_NULL(htab->map.record)) + if (IS_ERR_OR_NULL(map->record)) return 0; hrec = kzalloc(sizeof(*hrec), GFP_KERNEL); if (!hrec) return -ENOMEM; - hrec->key_size = key_size; - hrec->record = btf_record_dup(htab->map.record); + hrec->key_size = map->key_size; + hrec->record = btf_record_dup(map->record); if (IS_ERR(hrec->record)) { err = PTR_ERR(hrec->record); kfree(hrec); return err; } - ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec); return 0; } @@ -534,9 +537,9 @@ static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf, * populated in htab_map_alloc(), so it will always appear as NULL. 
*/ if (htab_is_percpu(htab)) - return htab_set_dtor(htab, htab_pcpu_mem_dtor); + return bpf_ma_set_dtor(map, &htab->pcpu_ma, htab_pcpu_mem_dtor); else - return htab_set_dtor(htab, htab_mem_dtor); + return bpf_ma_set_dtor(map, &htab->ma, htab_mem_dtor); } static struct bpf_map *htab_map_alloc(union bpf_attr *attr) @@ -834,8 +837,8 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, return insn - insn_buf; } -static void check_and_free_fields(struct bpf_htab *htab, - struct htab_elem *elem) +static void check_and_cancel_fields(struct bpf_htab *htab, + struct htab_elem *elem) { if (IS_ERR_OR_NULL(htab->map.record)) return; @@ -845,11 +848,11 @@ static void check_and_free_fields(struct bpf_htab *htab, int cpu; for_each_possible_cpu(cpu) - bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); + bpf_obj_cancel_fields(&htab->map, per_cpu_ptr(pptr, cpu)); } else { void *map_value = htab_elem_value(elem, htab->map.key_size); - bpf_obj_free_fields(htab->map.record, map_value); + bpf_obj_cancel_fields(&htab->map, map_value); } } @@ -884,7 +887,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) htab_unlock_bucket(b, flags); if (l == tgt_l) - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); return l == tgt_l; } @@ -949,7 +952,7 @@ find_first_elem: static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); @@ -1002,7 +1005,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) if (htab_is_prealloc(htab)) { bpf_map_dec_elem_count(&htab->map); - check_and_free_fields(htab, l); + check_and_cancel_fields(htab, l); pcpu_freelist_push(&htab->freelist, &l->fnode); } else { dec_elem_count(htab); @@ -1019,7 +1022,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, /* copy true value_size bytes 
*/ ptr = this_cpu_ptr(pptr); copy_map_value(&htab->map, ptr, value); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); } else { u32 size = round_up(htab->map.value_size, 8); void *val; @@ -1029,7 +1032,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, cpu = map_flags >> 32; ptr = per_cpu_ptr(pptr, cpu); copy_map_value(&htab->map, ptr, value); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); return; } @@ -1037,7 +1040,7 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, ptr = per_cpu_ptr(pptr, cpu); val = (map_flags & BPF_F_ALL_CPUS) ? value : value + size * cpu; copy_map_value(&htab->map, ptr, val); - bpf_obj_free_fields(htab->map.record, ptr); + bpf_obj_cancel_fields(&htab->map, ptr); } } } @@ -1253,11 +1256,11 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (l_old) { hlist_nulls_del_rcu(&l_old->hash_node); - /* l_old has already been stashed in htab->extra_elems, free - * its special fields before it is available for reuse. + /* l_old has already been stashed in htab->extra_elems, cancel + * its reusable special fields before it is available for reuse. 
*/ if (htab_is_prealloc(htab)) - check_and_free_fields(htab, l_old); + check_and_cancel_fields(htab, l_old); } htab_unlock_bucket(b, flags); if (l_old && !htab_is_prealloc(htab)) @@ -1270,7 +1273,7 @@ err: static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) { - check_and_free_fields(htab, elem); + check_and_cancel_fields(htab, elem); bpf_map_dec_elem_count(&htab->map); bpf_lru_push_free(&htab->lru, &elem->lru_node); } @@ -2739,3 +2742,794 @@ const struct bpf_map_ops htab_of_maps_map_ops = { BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], }; + +struct rhtab_elem { + struct rhash_head node; + /* key bytes, then value bytes follow */ + u8 data[] __aligned(8); +}; + +struct bpf_rhtab { + struct bpf_map map; + struct rhashtable ht; + struct bpf_mem_alloc ma; + u32 elem_size; + bool freeing_internal; +}; + +static const struct rhashtable_params rhtab_params = { + .head_offset = offsetof(struct rhtab_elem, node), + .key_offset = offsetof(struct rhtab_elem, data), +}; + +static inline void *rhtab_elem_value(struct rhtab_elem *l, u32 key_size) +{ + return l->data + round_up(key_size, 8); +} + +/* Specialize hash function and objcmp for long sized key */ +static __always_inline int rhtab_key_cmp_long(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const unsigned long key1 = *(const unsigned long *)arg->key; + const struct rhtab_elem *key2 = ptr; + + return key1 != *(const unsigned long *)key2->data; +} + +static __always_inline u32 rhtab_hashfn_long(const void *data, u32 len, u32 seed) +{ + u64 k = *(const unsigned long *)data; + + return (u32)(k ^ (k >> 32)) ^ seed; +} + +static const struct rhashtable_params rhtab_params_long = { + .head_offset = offsetof(struct rhtab_elem, node), + .key_offset = offsetof(struct rhtab_elem, data), + .key_len = sizeof(long), + .hashfn = rhtab_hashfn_long, + .obj_cmpfn = rhtab_key_cmp_long, +}; + +static struct bpf_map *rhtab_map_alloc(union bpf_attr *attr) +{ + struct rhashtable_params params; + 
struct bpf_rhtab *rhtab; + int err = 0; + + rhtab = bpf_map_area_alloc(sizeof(*rhtab), NUMA_NO_NODE); + if (!rhtab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&rhtab->map, attr); + + if (rhtab->map.max_entries > 1UL << 31) { + err = -E2BIG; + goto free_rhtab; + } + + rhtab->elem_size = sizeof(struct rhtab_elem) + round_up(rhtab->map.key_size, 8) + + round_up(rhtab->map.value_size, 8); + + params = rhtab_params; + params.key_len = rhtab->map.key_size; + params.nelem_hint = (u32)attr->map_extra; + params.automatic_shrinking = true; + + if (rhtab->map.key_size == sizeof(long)) { + params.hashfn = rhtab_hashfn_long; + params.obj_cmpfn = rhtab_key_cmp_long; + } + + err = rhashtable_init(&rhtab->ht, &params); + if (err) + goto free_rhtab; + + /* Set max_elems after rhashtable_init() since init zeroes the struct */ + rhtab->ht.max_elems = rhtab->map.max_entries; + + err = bpf_mem_alloc_init(&rhtab->ma, rhtab->elem_size, false); + if (err) + goto destroy_rhtab; + + return &rhtab->map; + +destroy_rhtab: + rhashtable_destroy(&rhtab->ht); +free_rhtab: + bpf_map_area_free(rhtab); + return ERR_PTR(err); +} + +static int rhtab_map_alloc_check(union bpf_attr *attr) +{ + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) + return -EINVAL; + + if (attr->map_flags & BPF_F_ZERO_SEED) + return -EINVAL; + + if (attr->key_size > U16_MAX) + return -E2BIG; + + if (attr->map_extra >> 32) + return -EINVAL; + + if ((u32)attr->map_extra > U16_MAX) + return -E2BIG; + + if ((u32)attr->map_extra > attr->max_entries) + return -EINVAL; + + return htab_map_alloc_check(attr); +} + +static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab, + struct rhtab_elem *elem) +{ + if (IS_ERR_OR_NULL(rhtab->map.record)) + return; + + bpf_obj_free_fields(rhtab->map.record, + rhtab_elem_value(elem, rhtab->map.key_size)); +} + +static void rhtab_mem_dtor(void *obj, void *ctx) +{ + struct htab_btf_record *hrec = ctx; + struct rhtab_elem *elem = obj; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + 
bpf_obj_free_fields(hrec->record, + rhtab_elem_value(elem, hrec->key_size)); +} + +static void rhtab_free_elem(void *ptr, void *arg) +{ + struct bpf_rhtab *rhtab = arg; + struct rhtab_elem *elem = ptr; + + bpf_map_free_internal_structs(&rhtab->map, rhtab_elem_value(elem, rhtab->map.key_size)); + bpf_mem_cache_free_rcu(&rhtab->ma, elem); +} + +static void rhtab_map_free(struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + rhashtable_free_and_destroy(&rhtab->ht, rhtab_free_elem, rhtab); + bpf_mem_alloc_destroy(&rhtab->ma); + bpf_map_area_free(rhtab); +} + +static void *rhtab_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + /* Hold RCU lock in case sleepable program calls via gen_lookup */ + guard(rcu)(); + + if (map->key_size == sizeof(long)) + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params_long); + + return rhashtable_lookup_likely(&rhtab->ht, key, rhtab_params); +} + +static void *rhtab_map_lookup_elem(struct bpf_map *map, void *key) __must_hold(RCU) +{ + struct rhtab_elem *l; + + l = rhtab_lookup_elem(map, key); + return l ? rhtab_elem_value(l, map->key_size) : NULL; +} + +static void rhtab_read_elem_value(struct bpf_map *map, void *dst, struct rhtab_elem *elem, + u64 flags) +{ + void *src = rhtab_elem_value(elem, map->key_size); + + if (flags & BPF_F_LOCK) + copy_map_value_locked(map, dst, src, true); + else + copy_map_value(map, dst, src); +} + +static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, void *copy, + u64 flags) +{ + int err; + + /* + * disable_instrumentation() mitigates the deadlock for programs running in NMI context. + * rhashtable locks bucket with local_irq_save(). Only NMI programs may reenter + * rhashtable code, bpf_disable_instrumentation() disables programs running in NMI, except + * raw tracepoints, which we don't have in rhashtable. 
+ */ + bpf_disable_instrumentation(); + + if (rhtab->map.key_size == sizeof(long)) + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params_long); + else + err = rhashtable_remove_fast(&rhtab->ht, &elem->node, rhtab_params); + + bpf_enable_instrumentation(); + + if (err) + return err; + + if (copy) { + rhtab_read_elem_value(&rhtab->map, copy, elem, flags); + check_and_init_map_value(&rhtab->map, copy); + } + /* Release internal structs: kptr, bpf_timer, task_work, wq */ + rhtab_check_and_free_fields(rhtab, elem); + bpf_mem_cache_free_rcu(&rhtab->ma, elem); + return 0; +} + + +static long rhtab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + + guard(rcu)(); + + elem = rhtab_lookup_elem(map, key); + if (!elem) + return -ENOENT; + + return rhtab_delete_elem(rhtab, elem, NULL, 0); +} + +static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void *value, u64 flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + int err; + + err = bpf_map_check_op_flags(map, flags, BPF_F_LOCK); + if (err) + return err; + + guard(rcu)(); + + elem = rhtab_lookup_elem(map, key); + if (!elem) + return -ENOENT; + + return rhtab_delete_elem(rhtab, elem, value, flags); +} + +static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value, + u64 map_flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void *old_val = rhtab_elem_value(elem, map->key_size); + + if (map_flags & BPF_NOEXIST) + return -EEXIST; + + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, old_val, value, false); + else + copy_map_value(map, old_val, value); + + /* + * Torn reads: a concurrent reader without BPF_F_LOCK may observe + * the value mid-copy. Callers requiring consistent reads must use + * BPF_F_LOCK, matching arraymap semantics. 
+ * + * copy_map_value() skips special-field offsets, so old timers/ + * kptrs/etc. still sit in the slot. Cancel them after the copy + * to match arraymap's update semantics. + */ + rhtab_check_and_free_fields(rhtab, elem); + return 0; +} + +static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem, *tmp; + + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) + return -EINVAL; + + if ((map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; + + guard(rcu)(); + elem = rhtab_lookup_elem(map, key); + if (elem) + return rhtab_map_update_existing(map, elem, value, map_flags); + + if (map_flags & BPF_EXIST) + return -ENOENT; + + /* + * Reject new insertions while map_release_uref cleanup walks the + * table. Without this, new elements could keep triggering rehash + * and prevent the walk from terminating. + */ + if (READ_ONCE(rhtab->freeing_internal)) + return -EBUSY; + + /* Check max_entries limit before inserting new element */ + if (atomic_read(&rhtab->ht.nelems) >= map->max_entries) + return -E2BIG; + + elem = bpf_mem_cache_alloc(&rhtab->ma); + if (!elem) + return -ENOMEM; + + memcpy(elem->data, key, map->key_size); + copy_map_value(map, rhtab_elem_value(elem, map->key_size), value); + check_and_init_map_value(map, rhtab_elem_value(elem, map->key_size)); + + /* Prevent deadlock for NMI programs attempting to take bucket lock */ + bpf_disable_instrumentation(); + + if (map->key_size == sizeof(long)) + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params_long); + else + tmp = rhashtable_lookup_get_insert_fast(&rhtab->ht, &elem->node, rhtab_params); + + bpf_enable_instrumentation(); + + if (tmp) { + bpf_mem_cache_free(&rhtab->ma, elem); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + return rhtab_map_update_existing(map, tmp, value, map_flags); + } + + return 0; +} + 
+static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + BUILD_BUG_ON(!__same_type(&rhtab_lookup_elem, + (void *(*)(struct bpf_map *map, void *key)) NULL)); + *insn++ = BPF_EMIT_CALL(rhtab_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct rhtab_elem, data) + round_up(map->key_size, 8)); + + return insn - insn_buf; +} + +static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + + return bpf_ma_set_dtor(map, &rhtab->ma, rhtab_mem_dtor); +} + +static void rhtab_map_free_internal_structs(struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhashtable_iter iter; + struct rhtab_elem *elem; + + if (!bpf_map_has_internal_structs(map)) + return; + + /* + * Block new insertions. Once observed, no new growth is triggered, + * so any in-flight rehash will drain and the walker is guaranteed + * to stop returning -EAGAIN. Treat -EAGAIN as "rehash in progress, + * retry"; do not wait for the worker. 
+ */ + WRITE_ONCE(rhtab->freeing_internal, true); + + rhashtable_walk_enter(&rhtab->ht, &iter); + rhashtable_walk_start(&iter); + + while ((elem = rhashtable_walk_next(&iter))) { + if (IS_ERR(elem)) { + if (PTR_ERR(elem) == -EAGAIN) + continue; + break; + } + + bpf_map_free_internal_structs(map, rhtab_elem_value(elem, map->key_size)); + + if (need_resched()) { /* Avoid stalls on large maps */ + rhashtable_walk_stop(&iter); + cond_resched(); + rhashtable_walk_start(&iter); + } + } + + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + WRITE_ONCE(rhtab->freeing_internal, false); +} + +static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) + __must_hold_shared(RCU) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + struct rhtab_elem *elem; + + elem = rhashtable_next_key(&rhtab->ht, key); + + /* if not found, return the first key */ + if (PTR_ERR(elem) == -ENOENT) + elem = rhashtable_next_key(&rhtab->ht, NULL); + + if (IS_ERR(elem)) + return PTR_ERR(elem); + if (!elem) + return -ENOENT; + + memcpy(next_key, elem->data, map->key_size); + return 0; +} + +static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) +{ + void *value; + + /* Guarantee that hashtab value is not freed */ + guard(rcu)(); + + value = rhtab_map_lookup_elem(map, key); + if (!value) + return; + + btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); + seq_puts(m, ": "); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_putc(m, '\n'); +} + +static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void *prev_key = NULL; + struct rhtab_elem *elem; + int num_elems = 0; + u64 ret = 0; + + cant_migrate(); + + if (flags != 0) + return -EINVAL; + + rcu_read_lock(); + /* + * Best-effort iteration: if rhashtable is concurrently resized or + * elements are 
deleted/inserted, there may be missed or duplicate + * elements visited. + */ + while ((elem = rhashtable_next_key(&rhtab->ht, prev_key))) { + if (IS_ERR(elem)) + break; + num_elems++; + ret = callback_fn((u64)(long)map, + (u64)(long)elem->data, + (u64)(long)rhtab_elem_value(elem, map->key_size), + (u64)(long)callback_ctx, 0); + if (ret) + break; + + prev_key = elem->data; /* valid while RCU held */ + } + rcu_read_unlock(); + + return num_elems; +} + +static u64 rhtab_map_mem_usage(const struct bpf_map *map) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + u64 num_entries; + + /* Excludes rhashtable bucket overhead (~ nelems * sizeof(void *) at 75% load). */ + num_entries = atomic_read(&rhtab->ht.nelems); + return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries; +} + +static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map, + const union bpf_attr *attr, + union bpf_attr __user *uattr, + bool do_delete) +{ + struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map); + void __user *uvalues = u64_to_user_ptr(attr->batch.values); + void __user *ukeys = u64_to_user_ptr(attr->batch.keys); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val; + struct rhtab_elem **del_elems = NULL; + u32 max_count, total, key_size, value_size, i; + bool has_next_cursor = false; + struct rhtab_elem *elem; + u64 elem_map_flags, map_flags; + int ret = 0; + + elem_map_flags = attr->batch.elem_flags; + ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK); + if (ret) + return ret; + + map_flags = attr->batch.flags; + if (map_flags) + return -EINVAL; + + max_count = attr->batch.count; + if (!max_count) + return 0; + + if (put_user(0, &uattr->batch.count)) + return -EFAULT; + + key_size = map->key_size; + value_size = map->value_size; + + keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc_array(max_count, 
value_size, GFP_USER | __GFP_NOWARN); + if (do_delete) + del_elems = kvmalloc_array(max_count, sizeof(void *), + GFP_USER | __GFP_NOWARN); + cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN); + + if (!keys || !values || !cursor || (do_delete && !del_elems)) { + ret = -ENOMEM; + goto free; + } + + if (ubatch && copy_from_user(cursor, ubatch, key_size)) { + ret = -EFAULT; + goto free; + } + + dst_key = keys; + dst_val = values; + total = 0; + + rcu_read_lock(); + + /* + * Cursor stores the key of the next-to-process element (stashed by + * the previous batch). Look it up directly so the element is included + * here rather than skipped by next_key(). If the cursor was deleted + * concurrently (or by the previous do_delete batch), return -EAGAIN + * so userspace can distinguish a lost cursor from end-of-iteration + * (-ENOENT) and restart from a NULL cursor. + */ + if (ubatch) { + elem = rhtab_lookup_elem(map, cursor); + if (!elem) { + rcu_read_unlock(); + ret = -EAGAIN; + goto free; + } + } else { + elem = rhashtable_next_key(&rhtab->ht, NULL); + } + + while (elem && !IS_ERR(elem) && total < max_count) { + memcpy(dst_key, elem->data, key_size); + rhtab_read_elem_value(map, dst_val, elem, elem_map_flags); + check_and_init_map_value(map, dst_val); + + if (do_delete) + del_elems[total] = elem; + + elem = rhashtable_next_key(&rhtab->ht, dst_key); + dst_key += key_size; + dst_val += value_size; + total++; + + /* Bail to userspace to avoid stalls. */ + if (need_resched()) + break; + } + + if (elem && !IS_ERR(elem)) { + /* Stash next-to-process key as cursor for the next batch. */ + memcpy(cursor, elem->data, key_size); + has_next_cursor = true; + } + + if (do_delete) { + for (i = 0; i < total; i++) + rhtab_delete_elem(rhtab, del_elems[i], NULL, 0); + } + + rcu_read_unlock(); + + if (total == 0) { + ret = -ENOENT; + goto free; + } + + /* No more elements after this batch. 
*/ + if (!has_next_cursor) + ret = -ENOENT; + + if (copy_to_user(ukeys, keys, (size_t)total * key_size) || + copy_to_user(uvalues, values, (size_t)total * value_size) || + put_user(total, &uattr->batch.count) || + (has_next_cursor && + copy_to_user(u64_to_user_ptr(attr->batch.out_batch), + cursor, key_size))) { + ret = -EFAULT; + goto free; + } + +free: + kfree(cursor); + kvfree(keys); + kvfree(values); + kvfree(del_elems); + return ret; +} + +static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false); +} + +static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true); +} + +struct bpf_iter_seq_rhash_map_info { + struct bpf_map *map; + struct bpf_rhtab *rhtab; + struct rhashtable_iter iter; +}; + +static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct rhtab_elem *elem; + + rhashtable_walk_start(&info->iter); + /* + * Re-deliver the element returned by walk_next() at the end of the + * previous read() — bpf_seq_read may have stopped before show() + * consumed it. Rehash rewinds the walker; retry on -EAGAIN. + */ + do { + elem = rhashtable_walk_peek(&info->iter); + } while (PTR_ERR(elem) == -EAGAIN); + + if (IS_ERR(elem)) + return NULL; + + if (elem && *pos == 0) + ++*pos; + return elem; +} + +static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct rhtab_elem *elem; + + ++*pos; + + /* Rehash rewinds the walker; retry until it stops returning -EAGAIN. 
*/ + do { + elem = rhashtable_walk_next(&info->iter); + } while (PTR_ERR(elem) == -EAGAIN); + + if (IS_ERR(elem)) + return NULL; + return elem; +} + +static int __bpf_rhash_map_seq_show(struct seq_file *seq, + struct rhtab_elem *elem) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + ctx.key = elem->data; + ctx.value = rhtab_elem_value(elem, info->map->key_size); + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_rhash_map_seq_show(seq, v); +} + +static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + struct bpf_iter_seq_rhash_map_info *info = seq->private; + + if (!v) + (void)__bpf_rhash_map_seq_show(seq, NULL); + + rhashtable_walk_stop(&info->iter); +} + +static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_rhash_map_info *info = priv_data; + struct bpf_map *map = aux->map; + + bpf_map_inc_with_uref(map); + info->map = map; + info->rhtab = container_of(map, struct bpf_rhtab, map); + rhashtable_walk_enter(&info->rhtab->ht, &info->iter); + return 0; +} + +static void bpf_iter_fini_rhash_map(void *priv_data) +{ + struct bpf_iter_seq_rhash_map_info *info = priv_data; + + rhashtable_walk_exit(&info->iter); + bpf_map_put_with_uref(info->map); +} + +static const struct seq_operations bpf_rhash_map_seq_ops = { + .start = bpf_rhash_map_seq_start, + .next = bpf_rhash_map_seq_next, + .stop = bpf_rhash_map_seq_stop, + .show = bpf_rhash_map_seq_show, +}; + +static const struct bpf_iter_seq_info rhash_iter_seq_info = { + .seq_ops = &bpf_rhash_map_seq_ops, + .init_seq_private = bpf_iter_init_rhash_map, + .fini_seq_private = 
bpf_iter_fini_rhash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_rhash_map_info), +}; + +BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab) +const struct bpf_map_ops rhtab_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = rhtab_map_alloc_check, + .map_alloc = rhtab_map_alloc, + .map_free = rhtab_map_free, + .map_get_next_key = rhtab_map_get_next_key, + .map_release_uref = rhtab_map_free_internal_structs, + .map_check_btf = rhtab_map_check_btf, + .map_lookup_elem = rhtab_map_lookup_elem, + .map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem, + .map_update_elem = rhtab_map_update_elem, + .map_delete_elem = rhtab_map_delete_elem, + .map_gen_lookup = rhtab_map_gen_lookup, + .map_seq_show_elem = rhtab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_each_rhash_elem, + .map_mem_usage = rhtab_map_mem_usage, + BATCH_OPS(rhtab), + .map_btf_id = &rhtab_map_btf_ids[0], + .iter_seq_info = &rhash_iter_seq_info, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2bb60200c266..8e196c9b7c50 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1944,7 +1944,7 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg3_type = ARG_PTR_TO_DYNPTR, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; @@ -2001,7 +2001,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_DYNPTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, @@ -2044,7 +2044,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = 
RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, + .arg1_type = ARG_PTR_TO_DYNPTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; @@ -2247,10 +2247,11 @@ EXPORT_SYMBOL_GPL(bpf_base_func_proto); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock) { - struct list_head *head = list_head, *orig_head = list_head; + struct list_head *head = list_head, drain, *pos, *n; BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); + INIT_LIST_HEAD(&drain); /* Do the actual list draining outside the lock to not hold the lock for * too long, and also prevent deadlocks if tracing programs end up @@ -2261,20 +2262,30 @@ void bpf_list_head_free(const struct btf_field *field, void *list_head, __bpf_spin_lock_irqsave(spin_lock); if (!head->next || list_empty(head)) goto unlock; - head = head->next; + list_for_each_safe(pos, n, head) { + struct bpf_list_node_kern *node; + + node = container_of(pos, struct bpf_list_node_kern, list_head); + WRITE_ONCE(node->owner, BPF_PTR_POISON); + list_move_tail(pos, &drain); + } unlock: - INIT_LIST_HEAD(orig_head); + INIT_LIST_HEAD(head); __bpf_spin_unlock_irqrestore(spin_lock); - while (head != orig_head) { - void *obj = head; + while (!list_empty(&drain)) { + struct bpf_list_node_kern *node; - obj -= field->graph_root.node_offset; - head = head->next; + pos = drain.next; + node = container_of(pos, struct bpf_list_node_kern, list_head); + list_del_init(pos); + /* Ensure __bpf_list_add() sees the node as unlinked. */ + smp_store_release(&node->owner, NULL); /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. 
*/ - __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); + __bpf_obj_drop_impl((char *)pos - field->graph_root.node_offset, + field->graph_root.value_rec, false); } } @@ -2295,6 +2306,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, struct bpf_spin_lock *spin_lock) { struct rb_root_cached orig_root, *root = rb_root; + struct bpf_rb_node_kern *node; struct rb_node *pos, *n; void *obj; @@ -2303,14 +2315,20 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, __bpf_spin_lock_irqsave(spin_lock); orig_root = *root; + bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { + node = rb_entry(pos, struct bpf_rb_node_kern, rb_node); + WRITE_ONCE(node->owner, BPF_PTR_POISON); + } *root = RB_ROOT_CACHED; __bpf_spin_unlock_irqrestore(spin_lock); bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { obj = pos; obj -= field->graph_root.node_offset; - - + node = rb_entry(pos, struct bpf_rb_node_kern, rb_node); + RB_CLEAR_NODE(pos); + /* Ensure __bpf_rbtree_add() sees the node as unlinked. */ + smp_store_release(&node->owner, NULL); __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); } } @@ -2467,9 +2485,11 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta static int __bpf_list_add(struct bpf_list_node_kern *node, struct bpf_list_head *head, - bool tail, struct btf_record *rec, u64 off) + struct list_head **prev_ptr, + struct btf_record *rec, u64 off) { struct list_head *n = &node->list_head, *h = (void *)head; + struct list_head *prev; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here @@ -2477,19 +2497,31 @@ static int __bpf_list_add(struct bpf_list_node_kern *node, if (unlikely(!h->next)) INIT_LIST_HEAD(h); + prev = *prev_ptr; + + /* When prev is not the list head, it must be a node in this list. 
*/ + if (prev != h) { + struct bpf_list_node_kern *prev_kn = + container_of(prev, struct bpf_list_node_kern, list_head); + + if (unlikely(READ_ONCE(prev_kn->owner) != head)) + goto fail; + } + /* node->owner != NULL implies !list_empty(n), no need to separately * check the latter */ - if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) { - /* Only called from BPF prog, no need to migrate_disable */ - __bpf_obj_drop_impl((void *)n - off, rec, false); - return -EINVAL; - } + if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) + goto fail; - tail ? list_add_tail(n, h) : list_add(n, h); + list_add(n, prev); WRITE_ONCE(node->owner, head); - return 0; + +fail: + /* Only called from BPF prog, no need to migrate_disable */ + __bpf_obj_drop_impl((void *)n - off, rec, false); + return -EINVAL; } /** @@ -2510,8 +2542,9 @@ __bpf_kfunc int bpf_list_push_front(struct bpf_list_head *head, u64 off) { struct bpf_list_node_kern *n = (void *)node; + struct list_head *h = (void *)head; - return __bpf_list_add(n, head, false, meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, &h, meta ? meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head, @@ -2539,8 +2572,9 @@ __bpf_kfunc int bpf_list_push_back(struct bpf_list_head *head, u64 off) { struct bpf_list_node_kern *n = (void *)node; + struct list_head *h = (void *)head; - return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off); + return __bpf_list_add(n, head, &h->prev, meta ? 
meta->record : NULL, off); } __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, @@ -2550,37 +2584,63 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head, return bpf_list_push_back(head, node, meta__ign, off); } -static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) +__bpf_kfunc int bpf_list_add(struct bpf_list_head *head, struct bpf_list_node *new, + struct bpf_list_node *prev__nonown_allowed, + struct btf_struct_meta *meta, u64 off) +{ + struct bpf_list_node_kern *n = (void *)new, *p = (void *)prev__nonown_allowed; + struct list_head *prev_ptr = &p->list_head; + + return __bpf_list_add(n, head, &prev_ptr, meta ? meta->record : NULL, off); +} + +static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, + struct list_head *n) { - struct list_head *n, *h = (void *)head; + struct list_head *h = (void *)head; struct bpf_list_node_kern *node; /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't * called on its fields, so init here */ - if (unlikely(!h->next)) + if (unlikely(!h->next)) { INIT_LIST_HEAD(h); + return NULL; + } if (list_empty(h)) return NULL; - n = tail ? h->prev : h->next; node = container_of(n, struct bpf_list_node_kern, list_head); - if (WARN_ON_ONCE(READ_ONCE(node->owner) != head)) + if (unlikely(READ_ONCE(node->owner) != head)) return NULL; list_del_init(n); - WRITE_ONCE(node->owner, NULL); + /* Ensure __bpf_list_add() sees the node as unlinked. 
*/ + smp_store_release(&node->owner, NULL); return (struct bpf_list_node *)n; } __bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) { - return __bpf_list_del(head, false); + struct list_head *h = (void *)head; + + return __bpf_list_del(head, h->next); } __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) { - return __bpf_list_del(head, true); + struct list_head *h = (void *)head; + + return __bpf_list_del(head, h->prev); +} + +__bpf_kfunc struct bpf_list_node *bpf_list_del(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct bpf_list_node_kern *kn = (void *)node__nonown_allowed; + + /* verifier guarantees node is a list node rather than list head */ + return __bpf_list_del(head, &kn->list_head); } __bpf_kfunc struct bpf_list_node *bpf_list_front(struct bpf_list_head *head) @@ -2603,6 +2663,43 @@ __bpf_kfunc struct bpf_list_node *bpf_list_back(struct bpf_list_head *head) return (struct bpf_list_node *)h->prev; } +__bpf_kfunc bool bpf_list_is_first(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct list_head *h = (struct list_head *)head; + struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed; + + if (READ_ONCE(kn->owner) != head) + return false; + + return list_is_first(&kn->list_head, h); +} + +__bpf_kfunc bool bpf_list_is_last(struct bpf_list_head *head, + struct bpf_list_node *node__nonown_allowed) +{ + struct list_head *h = (struct list_head *)head; + struct bpf_list_node_kern *kn = (struct bpf_list_node_kern *)node__nonown_allowed; + + if (READ_ONCE(kn->owner) != head) + return false; + + return list_is_last(&kn->list_head, h); +} + +__bpf_kfunc bool bpf_list_empty(struct bpf_list_head *head) +{ + struct list_head *h = (struct list_head *)head; + + /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't + * called on its fields, so init here + */ + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + + 
return list_empty(h); +} + __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, struct bpf_rb_node *node) { @@ -2912,11 +3009,13 @@ __bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid) { struct task_struct *p; - rcu_read_lock(); + guard(rcu)(); + if (!task_active_pid_ns(current)) + return NULL; + p = find_task_by_vpid(vpid); if (p) p = bpf_task_acquire(p); - rcu_read_unlock(); return p; } @@ -3072,7 +3171,7 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, return bpf_dynptr_slice(p, offset, buffer__nullable, buffer__szk); } -__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end) +__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr *p, u64 start, u64 end) { struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 size; @@ -3093,14 +3192,14 @@ __bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u64 start, u64 end __bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; return !ptr->data; } __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return false; @@ -3110,7 +3209,7 @@ __bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p) __bpf_kfunc u64 bpf_dynptr_size(const struct bpf_dynptr *p) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; if (!ptr->data) return -EINVAL; @@ -3122,7 +3221,7 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, struct bpf_dynptr *clone__uninit) { struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit; - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern 
*)p; if (!ptr->data) { bpf_dynptr_set_null(clone); @@ -3145,11 +3244,11 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, * Copies data from source dynptr to destination dynptr. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, - struct bpf_dynptr *src_ptr, u64 src_off, u64 size) +__bpf_kfunc int bpf_dynptr_copy(const struct bpf_dynptr *dst_ptr, u64 dst_off, + const struct bpf_dynptr *src_ptr, u64 src_off, u64 size) { - struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; - struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + const struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + const struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; void *src_slice, *dst_slice; char buf[256]; u64 off; @@ -3200,9 +3299,9 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u64 dst_off, * at @offset with the constant byte @val. * Returns 0 on success; negative error, otherwise. */ -__bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u64 offset, u64 size, u8 val) +__bpf_kfunc int bpf_dynptr_memset(const struct bpf_dynptr *p, u64 offset, u64 size, u8 val) { - struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; u64 chunk_sz, write_off; char buf[256]; void* slice; @@ -3301,7 +3400,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) * which skips compiler generated instrumentation to do the same. */ kasan_unpoison_task_stack_below((void *)(long)ctx.sp); - ctx.aux->bpf_exception_cb(cookie, ctx.sp, ctx.bp, 0, 0); + ctx.aux->bpf_exception_cb(cookie, ctx.sp + ctx.aux->stack_arg_sp_adjust, ctx.bp, 0, 0); WARN(1, "A call to BPF exception callback should never return\n"); } @@ -4214,13 +4313,13 @@ __bpf_kfunc void bpf_key_put(struct bpf_key *bkey) * * Return: 0 on success, a negative value on error. 
*/ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, +__bpf_kfunc int bpf_verify_pkcs7_signature(const struct bpf_dynptr *data_p, + const struct bpf_dynptr *sig_p, struct bpf_key *trusted_keyring) { #ifdef CONFIG_SYSTEM_DATA_VERIFICATION - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + const struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; const void *data, *sig; u32 data_len, sig_len; int ret; @@ -4241,8 +4340,13 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, data_len = __bpf_dynptr_size(data_ptr); data = __bpf_dynptr_data(data_ptr, data_len); + if (!data) + return -EINVAL; + sig_len = __bpf_dynptr_size(sig_ptr); sig = __bpf_dynptr_data(sig_ptr, sig_len); + if (!sig) + return -EINVAL; return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, @@ -4713,10 +4817,15 @@ BTF_ID_FLAGS(func, bpf_list_push_front, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_front_impl) BTF_ID_FLAGS(func, bpf_list_push_back, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_push_back_impl) +BTF_ID_FLAGS(func, bpf_list_add, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_del, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_front, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_back, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_is_first) +BTF_ID_FLAGS(func, bpf_list_is_last) +BTF_ID_FLAGS(func, bpf_list_empty) BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL) @@ -4857,7 +4966,7 @@ BTF_ID_FLAGS(func, bpf_stream_print_stack, KF_IMPLICIT_ARGS) 
BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, bpf_dynptr_from_file) -BTF_ID_FLAGS(func, bpf_dynptr_file_discard) +BTF_ID_FLAGS(func, bpf_dynptr_file_discard, KF_RELEASE) BTF_ID_FLAGS(func, bpf_timer_cancel_async) BTF_KFUNCS_END(common_btf_ids) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 25c06a011825..7837968c0842 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -21,6 +21,9 @@ #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/kstrtox.h> +#include <linux/xattr.h> +#include <linux/security.h> + #include "preload/bpf_preload.h" enum bpf_type { @@ -30,6 +33,23 @@ enum bpf_type { BPF_TYPE_LINK, }; +struct bpf_fs_inode { + struct list_head xattrs; + struct simple_xattr_limits xlimits; + struct inode vfs_inode; +}; + +static inline struct bpf_fs_inode *BPF_FS_I(struct inode *inode) +{ + return container_of(inode, struct bpf_fs_inode, vfs_inode); +} + +static struct kmem_cache *bpf_fs_inode_cachep __ro_after_init; + +static int bpf_fs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info); +static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size); + static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { @@ -94,10 +114,17 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) } static const struct inode_operations bpf_dir_iops; +static const struct inode_operations bpf_symlink_iops; -static const struct inode_operations bpf_prog_iops = { }; -static const struct inode_operations bpf_map_iops = { }; -static const struct inode_operations bpf_link_iops = { }; +static const struct inode_operations bpf_prog_iops = { + .listxattr = bpf_fs_listxattr, +}; +static const struct inode_operations bpf_map_iops = { + .listxattr = bpf_fs_listxattr, +}; +static const struct inode_operations bpf_link_iops = { + .listxattr = bpf_fs_listxattr, +}; struct inode 
*bpf_get_inode(struct super_block *sb, const struct inode *dir, @@ -153,11 +180,19 @@ static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; + int ret; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return ERR_CAST(inode); + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ERR_PTR(ret); + } + inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; @@ -330,10 +365,20 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; - struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); + struct inode *inode; + int ret; + + inode = bpf_get_inode(dir->i_sb, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ret; + } + inode->i_op = iops; inode->i_fop = fops; inode->i_private = raw; @@ -382,9 +427,11 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *target) { - char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); struct inode *inode; + char *link; + int ret; + link = kstrdup(target, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -394,13 +441,25 @@ static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, return PTR_ERR(inode); } - inode->i_op = &simple_symlink_inode_operations; + inode->i_op = &bpf_symlink_iops; inode->i_link = link; + ret = security_inode_init_security(inode, dir, &dentry->d_name, + bpf_fs_initxattrs, NULL); + if (ret && ret != -EOPNOTSUPP) { + iput(inode); + return ret; + } + bpf_dentry_finalize(dentry, inode, dir); return 0; } 
+static const struct inode_operations bpf_symlink_iops = { + .get_link = simple_get_link, + .listxattr = bpf_fs_listxattr, +}; + static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mkdir = bpf_mkdir, @@ -409,6 +468,7 @@ static const struct inode_operations bpf_dir_iops = { .rename = simple_rename, .link = simple_link, .unlink = simple_unlink, + .listxattr = bpf_fs_listxattr, }; /* pin iterator link into bpffs */ @@ -762,22 +822,151 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } +static struct inode *bpf_fs_alloc_inode(struct super_block *sb) +{ + struct bpf_fs_inode *bi; + + bi = alloc_inode_sb(sb, bpf_fs_inode_cachep, GFP_KERNEL); + if (!bi) + return NULL; + INIT_LIST_HEAD_RCU(&bi->xattrs); + simple_xattr_limits_init(&bi->xlimits); + return &bi->vfs_inode; +} + static void bpf_destroy_inode(struct inode *inode) { + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); enum bpf_type type; - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); - free_inode_nonrcu(inode); + simple_xattrs_free(&opts->xa_cache, &bi->xattrs, NULL); +} + +/* + * Called after RCU grace period - safe to free inode and anything + * that might be accessed by RCU pathwalk (inode fields, i_link). 
+ */ +static void bpf_free_inode(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + kmem_cache_free(bpf_fs_inode_cachep, BPF_FS_I(inode)); +} + +static int bpf_fs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + + name = xattr_full_name(handler, name); + return simple_xattr_get(&opts->xa_cache, &bi->xattrs, name, value, size); +} + +enum { + BPF_FS_XATTR_UNSPEC, + BPF_FS_XATTR_SECURITY, + BPF_FS_XATTR_TRUSTED, +}; + +static int bpf_fs_xattr_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, struct dentry *unused, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + struct simple_xattr *old; + int err = -EINVAL; + + name = xattr_full_name(handler, name); + switch (handler->flags) { + case BPF_FS_XATTR_SECURITY: + err = simple_xattr_set_limited(&opts->xa_cache, &bi->xattrs, + &bi->xlimits, name, value, size, + flags); + break; + case BPF_FS_XATTR_TRUSTED: + old = simple_xattr_set(&opts->xa_cache, &bi->xattrs, name, + value, size, flags); + err = IS_ERR(old) ? 
PTR_ERR(old) : 0; + if (!err) + simple_xattr_free_rcu(old); + break; + } + if (err) + return err; + inode_set_ctime_current(inode); + return 0; +} + +static const struct xattr_handler bpf_fs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = BPF_FS_XATTR_TRUSTED, + .get = bpf_fs_xattr_get, + .set = bpf_fs_xattr_set, +}; + +static const struct xattr_handler bpf_fs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = BPF_FS_XATTR_SECURITY, + .get = bpf_fs_xattr_get, + .set = bpf_fs_xattr_set, +}; + +static const struct xattr_handler * const bpf_fs_xattr_handlers[] = { + &bpf_fs_trusted_xattr_handler, + &bpf_fs_security_xattr_handler, + NULL, +}; + +static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size) +{ + struct inode *inode = d_inode(dentry); + + return simple_xattr_list(inode, &BPF_FS_I(inode)->xattrs, buf, size); +} + +static int bpf_fs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) +{ + struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; + struct bpf_fs_inode *bi = BPF_FS_I(inode); + const struct xattr *xattr; + int err; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); + if (IS_ERR(new_xattr)) + return PTR_ERR(new_xattr); + + new_xattr->name = kasprintf(GFP_KERNEL_ACCOUNT, + XATTR_SECURITY_PREFIX "%s", + xattr->name); + if (!new_xattr->name) + return -ENOMEM; + + err = simple_xattr_add_limited(&opts->xa_cache, &bi->xattrs, + &bi->xlimits, new_xattr); + if (err) + return err; + + retain_and_null_ptr(new_xattr); + } + return 0; } const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = inode_just_drop, .show_options = bpf_show_options, + .alloc_inode = bpf_fs_alloc_inode, .destroy_inode = bpf_destroy_inode, + .free_inode = bpf_free_inode, }; enum { @@ -996,25 +1185,38 @@ out: static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { - 
static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts *opts = sb->s_fs_info; struct inode *inode; - int ret; /* Mounting an instance of BPF FS requires privileges */ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) return -EPERM; - ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); - if (ret) - return ret; - + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = BPF_FS_MAGIC; sb->s_op = &bpf_super_ops; + sb->s_xattr = bpf_fs_xattr_handlers; + sb->s_iflags |= SB_I_NOEXEC; + sb->s_iflags |= SB_I_NODEV; + sb->s_time_gran = 1; - inode = sb->s_root->d_inode; + inode = bpf_get_inode(sb, NULL, S_IFDIR | 0777); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + inode->i_ino = 1; + inode->i_op = &bpf_dir_iops; + inode->i_fop = &simple_dir_operations; + set_nlink(inode, 2); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + inode = d_inode(sb->s_root); inode->i_uid = opts->uid; inode->i_gid = opts->gid; - inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); inode->i_mode |= S_ISVTX | opts->mode; @@ -1068,6 +1270,7 @@ static void bpf_kill_super(struct super_block *sb) struct bpf_mount_opts *opts = sb->s_fs_info; kill_anon_super(sb); + simple_xattr_cache_cleanup(&opts->xa_cache); kfree(opts); } @@ -1080,18 +1283,37 @@ static struct file_system_type bpf_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; +static void bpf_fs_inode_init_once(void *foo) +{ + struct bpf_fs_inode *bi = foo; + + inode_init_once(&bi->vfs_inode); +} + static int __init bpf_init(void) { int ret; + bpf_fs_inode_cachep = kmem_cache_create("bpf_fs_inode_cache", + sizeof(struct bpf_fs_inode), + 0, SLAB_ACCOUNT, + bpf_fs_inode_init_once); + if (!bpf_fs_inode_cachep) + return -ENOMEM; + ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) - return ret; + goto out_cache; ret = register_filesystem(&bpf_fs_type); - if (ret) + if (ret) { sysfs_remove_mount_point(fs_kobj, "bpf"); + goto 
out_cache; + } + return 0; +out_cache: + kmem_cache_destroy(bpf_fs_inode_cachep); return ret; } fs_initcall(bpf_init); diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 332e6e003f27..0aadfbae0acc 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -610,6 +610,21 @@ enum arg_track_state { /* Track callee stack slots fp-8 through fp-512 (64 slots of 8 bytes each) */ #define MAX_ARG_SPILL_SLOTS 64 +/* + * Combined register + stack arg tracking: R0-R10 at indices 0-10, + * outgoing stack arg slots at indices MAX_BPF_REG..MAX_BPF_REG+6. + */ +#define MAX_AT_TRACK_REGS (MAX_BPF_REG + MAX_STACK_ARG_SLOTS) + +static int stack_arg_off_to_slot(s16 off) +{ + int aoff = off < 0 ? -off : off; + + if (aoff / 8 > MAX_STACK_ARG_SLOTS) + return -1; + return aoff / 8 - 1; +} + static bool arg_is_visited(const struct arg_track *at) { return at->frame != ARG_UNVISITED; @@ -791,7 +806,9 @@ static bool arg_track_join(struct bpf_verifier_env *env, int idx, int target, in return true; verbose(env, "arg JOIN insn %d -> %d ", idx, target); - if (r >= 0) + if (r >= MAX_BPF_REG) + verbose(env, "sa%d: ", r - MAX_BPF_REG); + else if (r >= 0) verbose(env, "r%d: ", r); else verbose(env, "fp%+d: ", r * 8); @@ -1032,6 +1049,21 @@ static void arg_track_log(struct bpf_verifier_env *env, struct bpf_insn *insn, i verbose(env, "\tr%d: ", i); verbose_arg_track(env, &at_in[i]); verbose(env, " -> "); verbose_arg_track(env, &at_out[i]); } + /* Log outgoing stack arg slot transitions at indices MAX_BPF_REG..MAX_AT_TRACK_REGS-1 */ + for (i = 0; i < MAX_STACK_ARG_SLOTS; i++) { + int ai = MAX_BPF_REG + i; + + if (arg_track_eq(&at_out[ai], &at_in[ai])) + continue; + if (!printed) { + verbose(env, "%3d: ", idx); + bpf_verbose_insn(env, insn); + bpf_vlog_reset(&env->log, env->log.end_pos - 1); + printed = true; + } + verbose(env, "\tsa%d: ", i); verbose_arg_track(env, &at_in[ai]); + verbose(env, " -> "); verbose_arg_track(env, &at_out[ai]); + } for (i = 0; i < MAX_ARG_SPILL_SLOTS; i++) 
{ if (arg_track_eq(&at_stack_out[i], &at_stack_in[i])) continue; @@ -1062,6 +1094,7 @@ static bool can_be_local_fp(int depth, int regno, struct arg_track *at) static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, int insn_idx, struct arg_track *at_out, struct arg_track *at_stack_out, + const struct arg_track *at_stack_arg_entry, struct func_instance *instance, u32 *callsites) { @@ -1071,9 +1104,21 @@ static void arg_track_xfer(struct bpf_verifier_env *env, struct bpf_insn *insn, struct arg_track *dst = &at_out[insn->dst_reg]; struct arg_track *src = &at_out[insn->src_reg]; struct arg_track none = { .frame = ARG_NONE }; - int r; - - if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) { + int r, slot; + + /* Handle stack arg stores and loads. */ + if (is_stack_arg_st(insn) || is_stack_arg_stx(insn)) { + slot = stack_arg_off_to_slot(insn->off); + if (slot >= 0) { + if (is_stack_arg_stx(insn)) + at_out[MAX_BPF_REG + slot] = at_out[insn->src_reg]; + else + at_out[MAX_BPF_REG + slot] = none; + } + } else if (is_stack_arg_ldx(insn)) { + slot = stack_arg_off_to_slot(insn->off); + at_out[insn->dst_reg] = (slot >= 0) ? at_stack_arg_entry[slot] : none; + } else if (class == BPF_ALU64 && BPF_SRC(insn->code) == BPF_K) { if (code == BPF_MOV) { *dst = none; } else if (dst->frame >= 0) { @@ -1297,6 +1342,16 @@ static int record_load_store_access(struct bpf_verifier_env *env, struct arg_track resolved, *ptr; int oi; + /* + * Stack arg insns use dst_reg/src_reg=BPF_REG_PARAMS(11). Since at[] + * is extended to MAX_AT_TRACK_REGS, at[11] holds the arg_track for + * outgoing stack arg slot 0 — not the pointer used for the memory + * access. Skip so the slot's tracked value isn't confused with the + * base register that record_stack_access() expects. 
+ */ + if (is_stack_arg_stx(insn) || is_stack_arg_st(insn) || is_stack_arg_ldx(insn)) + return 0; + switch (class) { case BPF_LDX: ptr = &at[insn->src_reg]; @@ -1343,6 +1398,42 @@ static int record_load_store_access(struct bpf_verifier_env *env, return 0; } +static int record_arg_access(struct bpf_verifier_env *env, + struct func_instance *instance, + struct bpf_insn *insn, + struct arg_track *at, int arg_idx, + int insn_idx) +{ + int depth = instance->depth; + int frame = at->frame; + int err = 0; + s64 bytes; + + if (!arg_is_fp(at)) + return 0; + + if (bpf_helper_call(insn)) { + bytes = bpf_helper_stack_access_bytes(env, insn, arg_idx, insn_idx); + } else if (bpf_pseudo_kfunc_call(insn)) { + bytes = bpf_kfunc_stack_access_bytes(env, insn, arg_idx, insn_idx); + } else { + for (int f = 0; f <= depth; f++) { + err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); + if (err) + return err; + } + return 0; + } + if (bytes == 0) + return 0; + + if (frame >= 0 && frame <= depth) + err = record_stack_access(instance, at, bytes, frame, insn_idx); + else if (frame == ARG_IMPRECISE) + err = record_imprecise(instance, at->mask, insn_idx); + return err; +} + /* Record stack access for a given 'at' state of helper/kfunc 'insn' */ static int record_call_access(struct bpf_verifier_env *env, struct func_instance *instance, @@ -1350,9 +1441,8 @@ static int record_call_access(struct bpf_verifier_env *env, int insn_idx) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; - int depth = instance->depth; struct bpf_call_summary cs; - int r, err = 0, num_params = 5; + int r, err, num_params = 5; if (bpf_pseudo_call(insn)) return 0; @@ -1360,32 +1450,15 @@ static int record_call_access(struct bpf_verifier_env *env, if (bpf_get_call_summary(env, insn, &cs)) num_params = cs.num_params; - for (r = BPF_REG_1; r < BPF_REG_1 + num_params; r++) { - int frame = at[r].frame; - s64 bytes; - - if (!arg_is_fp(&at[r])) - continue; - - if (bpf_helper_call(insn)) { - bytes = 
bpf_helper_stack_access_bytes(env, insn, r - 1, insn_idx); - } else if (bpf_pseudo_kfunc_call(insn)) { - bytes = bpf_kfunc_stack_access_bytes(env, insn, r - 1, insn_idx); - } else { - for (int f = 0; f <= depth; f++) { - err = mark_stack_read(instance, f, insn_idx, SPIS_ALL); - if (err) - return err; - } - return 0; - } - if (bytes == 0) - continue; + for (r = BPF_REG_1; r < BPF_REG_1 + min(num_params, MAX_BPF_FUNC_REG_ARGS); r++) { + err = record_arg_access(env, instance, insn, &at[r], r - 1, insn_idx); + if (err) + return err; + } - if (frame >= 0 && frame <= depth) - err = record_stack_access(instance, &at[r], bytes, frame, insn_idx); - else if (frame == ARG_IMPRECISE) - err = record_imprecise(instance, at[r].mask, insn_idx); + for (r = 0; r < MAX_STACK_ARG_SLOTS && r < num_params - MAX_BPF_FUNC_REG_ARGS; r++) { + err = record_arg_access(env, instance, insn, &at[MAX_BPF_REG + r], + r + MAX_BPF_FUNC_REG_ARGS, insn_idx); if (err) return err; } @@ -1445,7 +1518,7 @@ static int find_callback_subprog(struct bpf_verifier_env *env, /* Per-subprog intermediate state kept alive across analysis phases */ struct subprog_at_info { - struct arg_track (*at_in)[MAX_BPF_REG]; + struct arg_track (*at_in)[MAX_AT_TRACK_REGS]; int len; }; @@ -1479,6 +1552,9 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, for (r = 0; r < MAX_BPF_REG - 1; r++) if (arg_is_fp(&info->at_in[i][r])) has_extra = true; + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) + if (arg_is_fp(&info->at_in[i][MAX_BPF_REG + r])) + has_extra = true; } if (is_ldx_stx_call) { for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) @@ -1503,6 +1579,12 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, verbose(env, " r%d=", r); verbose_arg_track(env, &info->at_in[i][r]); } + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) { + if (!arg_is_fp(&info->at_in[i][MAX_BPF_REG + r])) + continue; + verbose(env, " sa%d=", r); + verbose_arg_track(env, &info->at_in[i][MAX_BPF_REG + r]); + } } if (is_ldx_stx_call) { @@ 
-1525,7 +1607,7 @@ static void print_subprog_arg_access(struct bpf_verifier_env *env, * Runs forward fixed-point with arg_track_xfer(), then records * memory accesses in a single linear pass over converged state. * - * @callee_entry: pre-populated entry state for R1-R5 + * @callee_entry: pre-populated entry state for R1-R5 and stack args * NULL for main (subprog 0). * @info: stores at_in, len for debug printing. */ @@ -1543,10 +1625,11 @@ static int compute_subprog_args(struct bpf_verifier_env *env, int end = env->subprog_info[subprog + 1].start; int po_end = env->subprog_info[subprog + 1].postorder_start; int len = end - start; - struct arg_track (*at_in)[MAX_BPF_REG] = NULL; - struct arg_track at_out[MAX_BPF_REG]; + struct arg_track (*at_in)[MAX_AT_TRACK_REGS] = NULL; + struct arg_track at_out[MAX_AT_TRACK_REGS]; struct arg_track (*at_stack_in)[MAX_ARG_SPILL_SLOTS] = NULL; struct arg_track *at_stack_out = NULL; + struct arg_track at_stack_arg_entry[MAX_STACK_ARG_SLOTS]; struct arg_track unvisited = { .frame = ARG_UNVISITED }; struct arg_track none = { .frame = ARG_NONE }; bool changed; @@ -1565,13 +1648,13 @@ static int compute_subprog_args(struct bpf_verifier_env *env, goto err_free; for (i = 0; i < len; i++) { - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) at_in[i][r] = unvisited; for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[i][r] = unvisited; } - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) at_in[0][r] = none; /* Entry: R10 is always precisely the current frame's FP */ @@ -1587,6 +1670,10 @@ static int compute_subprog_args(struct bpf_verifier_env *env, for (r = 0; r < MAX_ARG_SPILL_SLOTS; r++) at_stack_in[0][r] = none; + /* Entry: incoming stack args from caller, or ARG_NONE for main */ + for (r = 0; r < MAX_STACK_ARG_SLOTS; r++) + at_stack_arg_entry[r] = callee_entry ? 
callee_entry[MAX_BPF_REG + r] : none; + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "subprog#%d: analyzing (depth %d)...\n", subprog, depth); @@ -1605,7 +1692,8 @@ redo: memcpy(at_out, at_in[i], sizeof(at_out)); memcpy(at_stack_out, at_stack_in[i], MAX_ARG_SPILL_SLOTS * sizeof(*at_stack_out)); - arg_track_xfer(env, insn, idx, at_out, at_stack_out, instance, callsites); + arg_track_xfer(env, insn, idx, at_out, at_stack_out, + at_stack_arg_entry, instance, callsites); arg_track_log(env, insn, idx, at_in[i], at_stack_in[i], at_out, at_stack_out); /* Propagate to successors within this subprogram */ @@ -1619,7 +1707,7 @@ redo: continue; ti = target - start; - for (r = 0; r < MAX_BPF_REG; r++) + for (r = 0; r < MAX_AT_TRACK_REGS; r++) changed |= arg_track_join(env, idx, target, r, &at_in[ti][r], at_out[r]); @@ -1674,11 +1762,14 @@ err_free: return err; } -/* Return true if any of R1-R5 is derived from a frame pointer. */ +/* Return true if any of R1-R5 or stack args is derived from a frame pointer. 
*/ static bool has_fp_args(struct arg_track *args) { for (int r = BPF_REG_1; r <= BPF_REG_5; r++) - if (args[r].frame != ARG_NONE) + if (arg_is_fp(&args[r])) + return true; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + if (arg_is_fp(&args[MAX_BPF_REG + r])) return true; return false; } @@ -1803,7 +1894,7 @@ static int analyze_subprog(struct bpf_verifier_env *env, /* For each reachable call site in the subprog, recurse into callees */ for (int p = po_start; p < po_end; p++) { int idx = env->cfg.insn_postorder[p]; - struct arg_track callee_args[BPF_REG_5 + 1]; + struct arg_track callee_args[MAX_AT_TRACK_REGS] = {}; struct arg_track none = { .frame = ARG_NONE }; struct bpf_insn *insn = &insns[idx]; struct func_instance *callee_instance; @@ -1818,9 +1909,11 @@ static int analyze_subprog(struct bpf_verifier_env *env, if (callee < 0) continue; - /* Build entry args: R1-R5 from at_in at call site */ + /* Build entry args: R1-R5 and stack args from at_in at call site */ for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = info[subprog].at_in[j][r]; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + callee_args[MAX_BPF_REG + r] = info[subprog].at_in[j][MAX_BPF_REG + r]; } else if (bpf_calls_callback(env, idx)) { callee = find_callback_subprog(env, insn, idx, &caller_reg, &cb_callee_reg); if (callee == -2) { @@ -1842,6 +1935,8 @@ static int analyze_subprog(struct bpf_verifier_env *env, for (int r = BPF_REG_1; r <= BPF_REG_5; r++) callee_args[r] = none; + for (int r = 0; r < MAX_STACK_ARG_SLOTS; r++) + callee_args[MAX_BPF_REG + r] = none; callee_args[cb_callee_reg] = info[subprog].at_in[j][caller_reg]; } else { continue; @@ -1914,26 +2009,15 @@ int bpf_compute_subprog_arg_access(struct bpf_verifier_env *env) return -ENOMEM; } - instance = call_instance(env, NULL, 0, 0); - if (IS_ERR(instance)) { - err = PTR_ERR(instance); - goto out; - } - err = analyze_subprog(env, NULL, info, instance, callsites); - if (err) - goto out; - /* - * Subprogs and callbacks that don't 
receive FP-derived arguments - * cannot access ancestor stack frames, so they were skipped during - * the recursive walk above. Async callbacks (timer, workqueue) are - * also not reachable from the main program's call graph. Analyze - * all unvisited subprogs as independent roots at depth 0. + * Analyze every subprog in reverse topological order (callers + * before callees) so that each subprog is analyzed before its + * callees, allowing the recursive walk inside analyze_subprog() + * to naturally reach callees that receive FP-derived args. * - * Use reverse topological order (callers before callees) so that - * each subprog is analyzed before its callees, allowing the - * recursive walk inside analyze_subprog() to naturally - * reach nested callees that also lack FP-derived args. + * Subprogs and callbacks that don't receive FP-derived arguments + * cannot access ancestor stack frames and are analyzed independently. + * Async callbacks (timer, workqueue) are handled the same way. */ for (k = env->subprog_cnt - 1; k >= 0; k--) { int sub = env->subprog_topo_order[k]; @@ -2096,7 +2180,7 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env, def = ALL_CALLER_SAVED_REGS; use = def & ~BIT(BPF_REG_0); if (bpf_get_call_summary(env, insn, &cs)) - use = GENMASK(cs.num_params, 1); + use = GENMASK(min_t(u8, cs.num_params, MAX_BPF_FUNC_REG_ARGS), 1); break; default: def = 0; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 011e4ec25acd..b740fa73ee26 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -13,17 +13,17 @@ #define verbose(env, fmt, args...) 
bpf_verifier_log_write(env, fmt, ##args) -static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +static bool bpf_verifier_log_attr_valid(u32 log_level, char __user *log_buf, u32 log_size) { /* ubuf and len_total should both be specified (or not) together */ - if (!!log->ubuf != !!log->len_total) + if (!!log_buf != !!log_size) return false; /* log buf without log_level is meaningless */ - if (log->ubuf && log->level == 0) + if (log_buf && log_level == 0) return false; - if (log->level & ~BPF_LOG_MASK) + if (log_level & ~BPF_LOG_MASK) return false; - if (log->len_total > UINT_MAX >> 2) + if (log_size > UINT_MAX >> 2) return false; return true; } @@ -36,7 +36,7 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, log->len_total = log_size; /* log attributes have to be sane */ - if (!bpf_verifier_log_attr_valid(log)) + if (!bpf_verifier_log_attr_valid(log_level, log_buf, log_size)) return -EINVAL; return 0; @@ -571,20 +571,20 @@ static void print_scalar_ranges(struct bpf_verifier_env *env, u64 val; bool omit; } minmaxs[] = { - {"smin", reg->smin_value, reg->smin_value == S64_MIN}, - {"smax", reg->smax_value, reg->smax_value == S64_MAX}, - {"umin", reg->umin_value, reg->umin_value == 0}, - {"umax", reg->umax_value, reg->umax_value == U64_MAX}, + {"smin", reg_smin(reg), reg_smin(reg) == S64_MIN}, + {"smax", reg_smax(reg), reg_smax(reg) == S64_MAX}, + {"umin", reg_umin(reg), reg_umin(reg) == 0}, + {"umax", reg_umax(reg), reg_umax(reg) == U64_MAX}, {"smin32", - is_snum_decimal((s64)reg->s32_min_value) - ? (s64)reg->s32_min_value - : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN}, + is_snum_decimal((s64)reg_s32_min(reg)) + ? (s64)reg_s32_min(reg) + : (u32)reg_s32_min(reg), reg_s32_min(reg) == S32_MIN}, {"smax32", - is_snum_decimal((s64)reg->s32_max_value) - ? 
(s64)reg->s32_max_value - : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX}, - {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, - {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, + is_snum_decimal((s64)reg_s32_max(reg)) + ? (s64)reg_s32_max(reg) + : (u32)reg_s32_max(reg), reg_s32_max(reg) == S32_MAX}, + {"umin32", reg_u32_min(reg), reg_u32_min(reg) == 0}, + {"umax32", reg_u32_max(reg), reg_u32_max(reg) == U32_MAX}, }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; bool neg1, neg2; @@ -665,8 +665,8 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); if (reg->id & BPF_ADD_CONST) verbose(env, "%+d", reg->delta); - if (reg->ref_obj_id) - verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (reg->parent_id) + verbose_a("parent_id=%d", reg->parent_id); if (type_is_non_owning_ref(reg->type)) verbose_a("%s", "non_own_ref"); if (type_is_map_ptr(t)) { @@ -768,21 +768,19 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); - if (reg->ref_obj_id) - verbose_a("ref_id=%d", reg->ref_obj_id); - if (reg->dynptr_id) - verbose_a("dynptr_id=%d", reg->dynptr_id); + if (reg->parent_id) + verbose_a("parent_id=%d", reg->parent_id); verbose(env, ")"); break; case STACK_ITER: - /* only main slot has ref_obj_id set; skip others */ - if (!reg->ref_obj_id) + /* only main slot has id set; skip others */ + if (!reg->id) continue; - verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", + verbose(env, " fp%d=iter_%s(id=%d,state=%s,depth=%u)", (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), - reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->id, iter_state_str(reg->iter.state), reg->iter.depth); break; case STACK_MISC: @@ -825,3 +823,81 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st } 
print_verifier_state(env, vstate, frameno, false); } + +int bpf_log_attr_init(struct bpf_log_attr *log, u64 log_buf, u32 log_size, u32 log_level, + u32 offsetof_log_true_size, bpfptr_t uattr, struct bpf_common_attr *common, + bpfptr_t uattr_common, u32 size_common) +{ + char __user *ubuf_common = u64_to_user_ptr(common->log_buf); + char __user *ubuf = u64_to_user_ptr(log_buf); + + if (!bpf_verifier_log_attr_valid(common->log_level, ubuf_common, common->log_size) || + !bpf_verifier_log_attr_valid(log_level, ubuf, log_size)) + return -EINVAL; + + if (ubuf && ubuf_common && (ubuf != ubuf_common || log_size != common->log_size || + log_level != common->log_level)) + return -EINVAL; + + memset(log, 0, sizeof(*log)); + log->ubuf = ubuf; + log->size = log_size; + log->level = log_level; + log->offsetof_true_size = offsetof_log_true_size; + log->uattr = uattr; + + if (!ubuf && ubuf_common) { + log->ubuf = ubuf_common; + log->size = common->log_size; + log->level = common->log_level; + log->uattr = uattr_common; + log->offsetof_true_size = 0; + if (size_common >= offsetofend(struct bpf_common_attr, log_true_size)) + log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size); + } + return 0; +} + +struct bpf_verifier_log *bpf_log_attr_create_vlog(struct bpf_log_attr *attr_log, + struct bpf_common_attr *common, bpfptr_t uattr, + u32 size) +{ + struct bpf_verifier_log *log; + int err; + + memset(attr_log, 0, sizeof(*attr_log)); + attr_log->uattr = uattr; + if (size >= offsetofend(struct bpf_common_attr, log_true_size)) + attr_log->offsetof_true_size = offsetof(struct bpf_common_attr, log_true_size); + + if (!size) + return NULL; + + log = kzalloc_obj(*log, GFP_KERNEL); + if (!log) + return ERR_PTR(-ENOMEM); + + err = bpf_vlog_init(log, common->log_level, u64_to_user_ptr(common->log_buf), + common->log_size); + if (err) { + kfree(log); + return ERR_PTR(err); + } + + return log; +} + +int bpf_log_attr_finalize(struct bpf_log_attr *attr, struct bpf_verifier_log 
*log) +{ + u32 log_true_size; + int err; + + err = bpf_vlog_finalize(log, &log_true_size); + + if (attr->offsetof_true_size && + copy_to_bpfptr_offset(attr->uattr, attr->offsetof_true_size, &log_true_size, + sizeof(log_true_size))) + return -EFAULT; + + return err; +} diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0f57608b385d..4d6f25db9ba1 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -246,7 +246,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) /* Start walking the trie from the root node ... */ - for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held()); + for (node = rcu_dereference_check(trie->root, bpf_rcu_lock_held()); node;) { unsigned int next_bit; size_t matchlen; @@ -280,7 +280,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) */ next_bit = extract_bit(key->data, node->prefixlen); node = rcu_dereference_check(node->child[next_bit], - rcu_read_lock_bh_held()); + bpf_rcu_lock_held()); } if (!found) @@ -359,7 +359,7 @@ static long trie_update_elem(struct bpf_map *map, */ slot = &trie->root; - while ((node = rcu_dereference(*slot))) { + while ((node = rcu_dereference_protected(*slot, 1))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -482,7 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) trim = &trie->root; trim2 = trim; parent = NULL; - while ((node = rcu_dereference(*trim))) { + while ((node = rcu_dereference_protected(*trim, 1))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 645bd30bc9a9..d2cbab4bdf64 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -20,7 +20,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) return ERR_PTR(-EINVAL); - + if (inner_map->excl_prog_sha) + return 
ERR_PTR(-ENOTSUPP); if (!inner_map->ops->map_meta_equal) return ERR_PTR(-ENOTSUPP); @@ -101,6 +102,8 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; + if (inner_map->excl_prog_sha) + return ERR_PTR(-ENOTSUPP); inner_map_meta = map->inner_map_meta; if (inner_map_meta->ops->map_meta_equal(inner_map_meta, inner_map)) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 261a03ea73d3..c19b360bad9e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -112,6 +112,10 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, map = bpf_map_get_with_uref(linfo->map.map_fd); if (IS_ERR(map)) return PTR_ERR(map); + if (map->excl_prog_sha) { + err = -EPERM; + goto put_map; + } if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -119,7 +123,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog, is_percpu = true; else if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_RHASH) goto put_map; key_acc_size = prog->aux->max_rdonly_access; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index da3d328f5c15..77ba03216c09 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -9,6 +9,7 @@ #include <linux/perf_event.h> #include <linux/btf_ids.h> #include <linux/buildid.h> +#include <linux/mmap_lock.h> #include "percpu_freelist.h" #include "mmap_unlock_work.h" @@ -152,6 +153,180 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b : build_id_parse_nofault(vma, build_id, NULL); } +static inline void stack_map_build_id_set_ip(struct bpf_stack_build_id *id) +{ + id->status = BPF_STACK_BUILD_ID_IP; + memset(id->build_id, 0, BUILD_ID_SIZE_MAX); +} + +static inline u64 stack_map_build_id_offset(unsigned long vm_pgoff, + unsigned long vm_start, u64 ip) +{ + 
return (vm_pgoff << PAGE_SHIFT) + ip - vm_start; +} + +static inline void stack_map_build_id_set_valid(struct bpf_stack_build_id *id, + u64 offset, + const unsigned char *build_id) +{ + id->status = BPF_STACK_BUILD_ID_VALID; + id->offset = offset; + if (id->build_id != build_id) + memcpy(id->build_id, build_id, BUILD_ID_SIZE_MAX); +} + +struct stack_map_vma_lock { + struct vm_area_struct *vma; + struct mm_struct *mm; +}; + +/* + * Acquire a stable read-side reference on the VMA covering @ip. + * + * With CONFIG_PER_VMA_LOCK=y this returns a VMA with its per-VMA read + * lock held and mmap_lock dropped, so the caller may sleep. + * + * With CONFIG_PER_VMA_LOCK=n it returns a VMA with mmap_lock still + * held; the caller must snapshot any fields it needs and pin vm_file + * with get_file() before stack_map_unlock_vma() drops mmap_lock, as + * the VMA may be split, merged, or freed after that. + * + * Returns NULL on failure, in which case no lock is held. + */ +static struct vm_area_struct * +stack_map_lock_vma(struct stack_map_vma_lock *lock, unsigned long ip) +{ + struct mm_struct *mm = lock->mm; + struct vm_area_struct *vma; + + /* noop under !CONFIG_PER_VMA_LOCK */ + vma = lock_vma_under_rcu(mm, ip); + if (vma) { + lock->vma = vma; + return vma; + } + + /* + * Taking mmap_read_lock() is unsafe here, because the caller BPF + * program might already hold it, causing a deadlock. 
+ */ + if (!mmap_read_trylock(mm)) + return NULL; + + vma = vma_lookup(mm, ip); + if (!vma) { + mmap_read_unlock(mm); + return NULL; + } + +#ifdef CONFIG_PER_VMA_LOCK + if (!vma_start_read_locked(vma)) { + mmap_read_unlock(mm); + return NULL; + } + mmap_read_unlock(mm); +#endif + + lock->vma = vma; + return vma; +} + +static void stack_map_unlock_vma(struct stack_map_vma_lock *lock) +{ +#ifdef CONFIG_PER_VMA_LOCK + vma_end_read(lock->vma); +#else + mmap_read_unlock(lock->mm); +#endif + lock->vma = NULL; +} + +static void stack_map_get_build_id_offset_sleepable(struct bpf_stack_build_id *id_offs, + u32 trace_nr) +{ + struct mm_struct *mm = current->mm; + struct stack_map_vma_lock lock = { .mm = mm }; + struct { + struct file *file; + const unsigned char *build_id; + unsigned long vm_start; + unsigned long vm_end; + unsigned long vm_pgoff; + } cache = {}; + unsigned long vm_pgoff, vm_start, vm_end; + struct vm_area_struct *vma; + struct file *file; + u64 offset; + u64 ip; + + for (u32 i = 0; i < trace_nr; i++) { + ip = READ_ONCE(id_offs[i].ip); + + /* + * Range cache fast path: if ip falls within the previously + * resolved VMA range, reuse the cache build_id without + * re-acquiring the VMA lock. + */ + if (cache.build_id && ip >= cache.vm_start && ip < cache.vm_end) { + offset = stack_map_build_id_offset(cache.vm_pgoff, cache.vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + continue; + } + + vma = stack_map_lock_vma(&lock, ip); + if (!vma) { + stack_map_build_id_set_ip(&id_offs[i]); + continue; + } + if (vma_is_anonymous(vma) || !vma->vm_file) { + stack_map_build_id_set_ip(&id_offs[i]); + stack_map_unlock_vma(&lock); + continue; + } + + file = vma->vm_file; + vm_pgoff = vma->vm_pgoff; + vm_start = vma->vm_start; + vm_end = vma->vm_end; + offset = stack_map_build_id_offset(vm_pgoff, vm_start, ip); + + /* + * Same backing file as previous (e.g. different VMAs + * of the same ELF binary). Reuse the cache build_id. 
+ */ + if (file == cache.file) { + stack_map_unlock_vma(&lock); + stack_map_build_id_set_valid(&id_offs[i], offset, cache.build_id); + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; + continue; + } + + file = get_file(file); + stack_map_unlock_vma(&lock); + + /* build_id_parse_file() may block on filesystem reads */ + if (build_id_parse_file(file, id_offs[i].build_id, NULL)) { + stack_map_build_id_set_ip(&id_offs[i]); + fput(file); + continue; + } + + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); + if (cache.file) + fput(cache.file); + cache.file = file; + cache.build_id = id_offs[i].build_id; + cache.vm_start = vm_start; + cache.vm_end = vm_end; + cache.vm_pgoff = vm_pgoff; + } + + if (cache.file) + fput(cache.file); +} + /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: @@ -165,44 +340,50 @@ static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, b static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u32 trace_nr, bool user, bool may_fault) { - int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); + bool has_user_ctx = user && current && current->mm; struct vm_area_struct *vma, *prev_vma = NULL; - const char *prev_build_id; + const unsigned char *prev_build_id = NULL; + int i; + + if (may_fault && has_user_ctx) { + stack_map_get_build_id_offset_sleepable(id_offs, trace_nr); + return; + } /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. 
*/ - if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock(current->mm)) { + if (!has_user_ctx || irq_work_busy || !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ - for (i = 0; i < trace_nr; i++) { - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); - } + for (i = 0; i < trace_nr; i++) + stack_map_build_id_set_ip(&id_offs[i]); return; } for (i = 0; i < trace_nr; i++) { u64 ip = READ_ONCE(id_offs[i].ip); + u64 offset; - if (range_in_vma(prev_vma, ip, ip)) { + if (prev_build_id && range_in_vma(prev_vma, ip, ip)) { vma = prev_vma; - memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX); - goto build_id_valid; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, prev_build_id); + continue; } vma = find_vma(current->mm, ip); - if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { + if (!vma || vma_is_anonymous(vma) || + fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ - id_offs[i].status = BPF_STACK_BUILD_ID_IP; - memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); + stack_map_build_id_set_ip(&id_offs[i]); + prev_vma = vma; + prev_build_id = NULL; continue; } -build_id_valid: - id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start; - id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + offset = stack_map_build_id_offset(vma->vm_pgoff, vma->vm_start, ip); + stack_map_build_id_set_valid(&id_offs[i], offset, id_offs[i].build_id); prev_vma = vma; prev_build_id = id_offs[i].build_id; } diff --git a/kernel/bpf/states.c b/kernel/bpf/states.c index 8478d2c6ed5b..32f346ce3ffc 100644 --- a/kernel/bpf/states.c +++ b/kernel/bpf/states.c @@ -2,6 +2,7 @@ /* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 
*/ #include <linux/bpf.h> #include <linux/bpf_verifier.h> +#include <linux/cnum.h> #include <linux/filter.h> #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) @@ -301,14 +302,8 @@ int bpf_update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_s static bool range_within(const struct bpf_reg_state *old, const struct bpf_reg_state *cur) { - return old->umin_value <= cur->umin_value && - old->umax_value >= cur->umax_value && - old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value && - old->u32_min_value <= cur->u32_min_value && - old->u32_max_value >= cur->u32_max_value && - old->s32_min_value <= cur->s32_min_value && - old->s32_max_value >= cur->s32_max_value; + return cnum64_is_subset(old->r64, cur->r64) && + cnum32_is_subset(old->r32, cur->r32); } /* If in the old state two registers had the same id, then they need to have @@ -348,8 +343,12 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_idmap *idmap) return true; } - /* We ran out of idmap slots, which should be impossible */ - WARN_ON_ONCE(1); + /* + * idmap slots are bounded by the number of registers and stack slots. + * However, referenced dynptrs acquire intermediate references that do + * not live in either, so the map can be exhausted. Since this is unlikely, + * fail the verification by treating the states as not equivalent. 
+ */ return false; } @@ -494,7 +493,7 @@ static bool regs_exact(const struct bpf_reg_state *rold, { return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + check_ids(rold->parent_id, rcur->parent_id, idmap); } enum exact_level { @@ -619,7 +618,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off) && check_ids(rold->id, rcur->id, idmap) && - check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); + check_ids(rold->parent_id, rcur->parent_id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: /* We must have at least as much range as the old ptr @@ -799,7 +798,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, cur_reg = &cur->stack[spi].spilled_ptr; if (old_reg->dynptr.type != cur_reg->dynptr.type || old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + !check_ids(old_reg->id, cur_reg->id, idmap) || + !check_ids(old_reg->parent_id, cur_reg->parent_id, idmap)) return false; break; case STACK_ITER: @@ -815,13 +815,13 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, old_reg->iter.btf_id != cur_reg->iter.btf_id || old_reg->iter.state != cur_reg->iter.state || /* ignore {old_reg,cur_reg}->iter.depth, see above */ - !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + !check_ids(old_reg->id, cur_reg->id, idmap)) return false; break; case STACK_IRQ_FLAG: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || + if (!check_ids(old_reg->id, cur_reg->id, idmap) || old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) return false; break; @@ -838,6 +838,32 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, 
return true; } +/* + * Compare stack arg slots between old and current states. + * Outgoing stack args are path-local state and must agree for pruning. + */ +static bool stack_arg_safe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_idmap *idmap, + enum exact_level exact) +{ + int i, nslots; + + nslots = max(old->out_stack_arg_cnt, cur->out_stack_arg_cnt); + for (i = 0; i < nslots; i++) { + struct bpf_reg_state *old_arg, *cur_arg; + struct bpf_reg_state not_init = { .type = NOT_INIT }; + + old_arg = i < old->out_stack_arg_cnt ? + &old->stack_arg_regs[i] : &not_init; + cur_arg = i < cur->out_stack_arg_cnt ? + &cur->stack_arg_regs[i] : &not_init; + if (!regsafe(env, old_arg, cur_arg, idmap, exact)) + return false; + } + + return true; +} + static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur, struct bpf_idmap *idmap) { @@ -868,6 +894,9 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c return false; switch (old->refs[i].type) { case REF_TYPE_PTR: + if (!check_ids(old->refs[i].parent_id, cur->refs[i].parent_id, idmap)) + return false; + break; case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: @@ -920,6 +949,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (old->callback_depth > cur->callback_depth) return false; + if (!old->no_stack_arg_load && cur->no_stack_arg_load) + return false; + for (i = 0; i < MAX_BPF_REG; i++) if (((1 << i) & live_regs) && !regsafe(env, &old->regs[i], &cur->regs[i], @@ -929,6 +961,9 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) return false; + if (!stack_arg_safe(env, old, cur, &env->idmap_scratch, exact)) + return false; + return true; } @@ -1376,7 +1411,7 @@ hit: */ err = 0; if (bpf_is_jmp_point(env, env->insn_idx)) - err = bpf_push_jmp_history(env, cur, 0, 0); + err = bpf_push_jmp_history(env, cur, 0, 
0, 0, 0); err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3c0214ca934..b44106c8ea75 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -41,6 +41,7 @@ #include <linux/overflow.h> #include <linux/cookie.h> #include <linux/verification.h> +#include <linux/btf_ids.h> #include <net/netfilter/nf_bpf_link.h> #include <net/netkit.h> @@ -807,6 +808,11 @@ void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) bpf_task_work_cancel_and_free(obj + rec->task_work_off); } +void bpf_obj_cancel_fields(struct bpf_map *map, void *obj) +{ + bpf_map_free_internal_structs(map, obj); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -1280,6 +1286,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_SPIN_LOCK: case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && @@ -1294,6 +1301,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_WORKQUEUE: case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { ret = -EOPNOTSUPP; @@ -1305,6 +1313,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case BPF_KPTR_PERCPU: case BPF_REFCOUNT: if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_RHASH && map->map_type != BPF_MAP_TYPE_PERCPU_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH && @@ -1359,7 +1368,8 @@ free_map_tab: #define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bpfptr_t uattr) 
+static int map_create_alloc(union bpf_attr *attr, bpfptr_t uattr, struct bpf_verifier_log *log, + struct bpf_map **mapp, struct bpf_token **tokenp) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1367,12 +1377,13 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) u32 map_type = attr->map_type; struct bpf_map *map; bool token_flag; - int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); - if (err) + if (err) { + bpf_log(log, "Invalid attr.\n"); return -EINVAL; + } /* check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it * to avoid per-map type checks tripping on unknown flag @@ -1381,31 +1392,40 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) attr->map_flags &= ~BPF_F_TOKEN_FD; if (attr->btf_vmlinux_value_type_id) { - if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS || - attr->btf_key_type_id || attr->btf_value_type_id) + if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS) { + bpf_log(log, "btf_vmlinux_value_type_id can only be used with struct_ops maps.\n"); return -EINVAL; + } + if (attr->btf_key_type_id || attr->btf_value_type_id) { + bpf_log(log, "btf_vmlinux_value_type_id is mutually exclusive with btf_key_type_id and btf_value_type_id.\n"); + return -EINVAL; + } } else if (attr->btf_key_type_id && !attr->btf_value_type_id) { + bpf_log(log, "Invalid btf_value_type_id.\n"); return -EINVAL; } if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER && attr->map_type != BPF_MAP_TYPE_ARENA && - attr->map_extra != 0) + attr->map_type != BPF_MAP_TYPE_RHASH && + attr->map_extra != 0) { + bpf_log(log, "Invalid map_extra.\n"); return -EINVAL; - - f_flags = bpf_get_file_flag(attr->map_flags); - if (f_flags < 0) - return f_flags; + } if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || - !node_online(numa_node))) + !node_online(numa_node))) { + bpf_log(log, "Invalid numa_node.\n"); return -EINVAL; + } /* find map type and init map: hashtable vs rbtree vs bloom vs ... 
*/ map_type = attr->map_type; - if (map_type >= ARRAY_SIZE(bpf_map_types)) + if (map_type >= ARRAY_SIZE(bpf_map_types)) { + bpf_log(log, "Invalid map_type.\n"); return -EINVAL; + } map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types)); ops = bpf_map_types[map_type]; if (!ops) @@ -1423,8 +1443,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) if (token_flag) { token = bpf_token_get_from_fd(attr->map_token_fd); - if (IS_ERR(token)) + if (IS_ERR(token)) { + bpf_log(log, "Invalid map_token_fd.\n"); return PTR_ERR(token); + } /* if current token doesn't grant map creation permissions, * then we can't use this token, so ignore it and rely on @@ -1457,6 +1479,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_MAP_TYPE_CGROUP_ARRAY: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: @@ -1507,8 +1530,10 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) err = bpf_obj_name_cpy(map->name, attr->map_name, sizeof(attr->map_name)); - if (err < 0) + if (err < 0) { + bpf_log(log, "Invalid map_name.\n"); goto free_map; + } preempt_disable(); map->cookie = gen_cookie_next(&bpf_map_cookie); @@ -1531,6 +1556,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { + bpf_log(log, "Invalid btf_fd.\n"); err = PTR_ERR(btf); goto free_map; } @@ -1558,6 +1584,7 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; goto free_map; } @@ -1572,11 +1599,62 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) err = -EFAULT; goto free_map; } + + /* See libbpf: emit_signature_match() */ + BUILD_BUG_ON(offsetof(struct bpf_map, excl) != 
SHA256_DIGEST_SIZE); + BUILD_BUG_ON(!__same_type(map->excl, u32)); + BUILD_BUG_ON(offsetof(struct bpf_map, sha) != 0); + BUILD_BUG_ON(!__same_type(map->sha, u8[SHA256_DIGEST_SIZE])); + map->excl = 1; } else if (attr->excl_prog_hash_size) { + bpf_log(log, "Invalid excl_prog_hash_size.\n"); err = -EINVAL; goto free_map; } + *mapp = map; + *tokenp = token; + return 0; + +free_map: + bpf_map_free(map); +put_token: + bpf_token_put(token); + return err; +} + +static int map_create(union bpf_attr *attr, bpfptr_t uattr, struct bpf_common_attr *attr_common, + bpfptr_t uattr_common, u32 size_common) +{ + struct bpf_token *token = NULL; + struct bpf_verifier_log *log; + struct bpf_log_attr attr_log; + struct bpf_map *map = NULL; + int err, ret; + int f_flags; + + log = bpf_log_attr_create_vlog(&attr_log, attr_common, uattr_common, size_common); + if (IS_ERR(log)) + return PTR_ERR(log); + + err = map_create_alloc(attr, uattr, log, &map, &token); + + /* preserve original error even if log finalization is successful */ + ret = bpf_log_attr_finalize(&attr_log, log); + if (ret) + err = ret; + + kfree(log); + + if (err) + goto free_map; + + f_flags = bpf_get_file_flag(attr->map_flags); + if (f_flags < 0) { + err = f_flags; + goto free_map; + } + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -1605,8 +1683,8 @@ static int map_create(union bpf_attr *attr, bpfptr_t uattr) free_map_sec: security_bpf_map_free(map); free_map: - bpf_map_free(map); -put_token: + if (map) + bpf_map_free(map); bpf_token_put(token); return err; } @@ -2192,6 +2270,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_RHASH || map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); @@ -2646,7 +2725,8 @@ static int 
bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, struct btf *attach_btf, u32 btf_id, - struct bpf_prog *dst_prog) + struct bpf_prog *dst_prog, + bool multi_func) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) @@ -2666,6 +2746,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } + if (multi_func) { + if (prog_type != BPF_PROG_TYPE_TRACING) + return -EINVAL; + if (!attach_btf || btf_id) + return -EINVAL; + return 0; + } + if (attach_btf && (!btf_id || dst_prog)) return -EINVAL; @@ -2798,8 +2886,22 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static enum bpf_sig_keyring bpf_classify_keyring(s32 keyring_id) +{ + switch (keyring_id) { + case 0: + return BPF_SIG_KEYRING_BUILTIN; + case (s32)(unsigned long)VERIFY_USE_SECONDARY_KEYRING: + return BPF_SIG_KEYRING_SECONDARY; + case (s32)(unsigned long)VERIFY_USE_PLATFORM_KEYRING: + return BPF_SIG_KEYRING_PLATFORM; + default: + return BPF_SIG_KEYRING_USER; + } +} + static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, - bool is_kernel) + bool is_kernel, s32 *keyring_serial) { bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); struct bpf_dynptr_kern sig_ptr, insns_ptr; @@ -2835,7 +2937,8 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, (struct bpf_dynptr *)&sig_ptr, key); - + if (!err) + *keyring_serial = bpf_key_serial(key); bpf_key_put(key); kvfree(sig); return err; @@ -2858,10 +2961,15 @@ static int bpf_prog_mark_insn_arrays_ready(struct bpf_prog *prog) return 0; } +extern int bpf_multi_func(void); +int __init __used bpf_multi_func(void) { return 0; } + +BTF_ID_LIST_GLOBAL_SINGLE(bpf_multi_func_btf_id, func, bpf_multi_func) + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD keyring_id -static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) 
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; @@ -2870,6 +2978,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) bool bpf_cap; int err; char license[128]; + bool multi_func; if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; @@ -2936,6 +3045,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON)) goto put_token; + multi_func = is_tracing_multi(attr->expected_attach_type); + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog * or btf, we need to check which one it is */ @@ -2957,7 +3068,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) goto put_token; } } - } else if (attr->attach_btf_id) { + } else if (attr->attach_btf_id || multi_func) { /* fall back to vmlinux BTF, if BTF type ID is specified */ attach_btf = bpf_get_btf_vmlinux(); if (IS_ERR(attach_btf)) { @@ -2973,7 +3084,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (bpf_prog_load_check_attach(type, attr->expected_attach_type, attach_btf, attr->attach_btf_id, - dst_prog)) { + dst_prog, multi_func)) { if (dst_prog) bpf_prog_put(dst_prog); if (attach_btf) @@ -2996,7 +3107,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) prog->expected_attach_type = attr->expected_attach_type; prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE); prog->aux->attach_btf = attach_btf; - prog->aux->attach_btf_id = attr->attach_btf_id; + prog->aux->attach_btf_id = multi_func ? 
bpf_multi_func_btf_id[0] : attr->attach_btf_id; prog->aux->dst_prog = dst_prog; prog->aux->dev_bound = !!attr->prog_ifindex; prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; @@ -3022,13 +3133,17 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; - if (attr->signature) { - err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel, + &prog->aux->sig.keyring_serial); if (err) goto free_prog; + prog->aux->sig.keyring_type = bpf_classify_keyring(attr->keyring_id); + prog->aux->sig.verdict = BPF_SIG_VERIFIED; + } else { + prog->aux->sig.keyring_type = BPF_SIG_KEYRING_NONE; + prog->aux->sig.verdict = BPF_SIG_UNSIGNED; } - prog->orig_prog = NULL; prog->jited = 0; @@ -3076,10 +3191,10 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) - goto free_prog_sec; + goto free_prog; /* run eBPF verifier */ - err = bpf_check(&prog, attr, uattr, uattr_size); + err = bpf_check(&prog, attr, uattr, attr_log); if (err < 0) goto free_used_maps; @@ -3122,8 +3237,6 @@ free_used_maps: __bpf_prog_put_noref(prog, prog->aux->real_func_cnt); return err; -free_prog_sec: - security_bpf_prog_free(prog); free_prog: free_uid(prog->aux->user); if (prog->aux->attach_btf) @@ -3198,6 +3311,15 @@ void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); } +void bpf_tramp_link_init(struct bpf_tramp_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type, u64 cookie) +{ + bpf_link_init(&link->link, type, ops, prog, attach_type); + link->node.link = &link->link; + link->node.cookie = cookie; +} + static void 
bpf_link_free_id(int id) { if (!id) @@ -3358,7 +3480,7 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? "kretprobe_multi" : "kprobe_multi"); else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) - seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? + seq_printf(m, "link_type:\t%s\n", link->flags & BPF_F_UPROBE_MULTI_RETURN ? "uretprobe_multi" : "uprobe_multi"); else seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); @@ -3505,7 +3627,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link, + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link.node, tr_link->trampoline, tr_link->tgt_prog)); @@ -3518,8 +3640,7 @@ static void bpf_tracing_link_release(struct bpf_link *link) static void bpf_tracing_link_dealloc(struct bpf_link *link) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); kfree(tr_link); } @@ -3527,8 +3648,8 @@ static void bpf_tracing_link_dealloc(struct bpf_link *link) static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { - struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); + u32 target_btf_id, target_obj_id; bpf_trampoline_unpack_key(tr_link->trampoline->key, @@ -3541,17 +3662,16 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, link->attach_type, target_obj_id, target_btf_id, - tr_link->link.cookie); + tr_link->link.node.cookie); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { - 
struct bpf_tracing_link *tr_link = - container_of(link, struct bpf_tracing_link, link.link); + struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = link->attach_type; - info->tracing.cookie = tr_link->link.cookie; + info->tracing.cookie = tr_link->link.node.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); @@ -3633,29 +3753,18 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } - if (prog->expected_attach_type == BPF_TRACE_FSESSION) { - struct bpf_fsession_link *fslink; - - fslink = kzalloc_obj(*fslink, GFP_USER); - if (fslink) { - bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type); - fslink->fexit.cookie = bpf_cookie; - link = &fslink->link; - } else { - link = NULL; - } - } else { - link = kzalloc_obj(*link, GFP_USER); - } + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; goto out_put_prog; } - bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog, attach_type); + bpf_tramp_link_init(&link->link, BPF_LINK_TYPE_TRACING, + &bpf_tracing_link_lops, prog, attach_type, bpf_cookie); - link->link.cookie = bpf_cookie; + if (prog->expected_attach_type == BPF_TRACE_FSESSION) { + link->fexit.link = &link->link.link; + link->fexit.cookie = bpf_cookie; + } mutex_lock(&prog->aux->dst_mutex); @@ -3758,7 +3867,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; - err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog); + err = bpf_trampoline_link_prog(&link->link.node, tr, tgt_prog); if (err) { bpf_link_cleanup(&link_primer); link = NULL; @@ -4281,6 +4390,11 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, if (!btp) return -ENOENT; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_put_raw_tracepoint(btp); + 
return -EINVAL; + } + link = kzalloc_obj(*link, GFP_USER); if (!link) { err = -ENOMEM; @@ -4389,6 +4503,9 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: + case BPF_TRACE_FSESSION_MULTI: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: case BPF_MODIFY_RETURN: return BPF_PROG_TYPE_TRACING; case BPF_LSM_MAC: @@ -4654,7 +4771,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) #define BPF_PROG_QUERY_LAST_FIELD query.revision static int bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) + union bpf_attr __user *uattr, u32 uattr_size) { if (!bpf_net_capable()) return -EPERM; @@ -4693,7 +4810,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: case BPF_LSM_CGROUP: - return cgroup_bpf_prog_query(attr, uattr); + return cgroup_bpf_prog_query(attr, uattr, uattr_size); case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: @@ -4919,6 +5036,29 @@ out: return map; } +static void prepare_dump_pseudo_call(struct bpf_insn *insn) +{ + s32 call_off = insn->imm; + + /* + * BPF_CALL_ARGS only exists for interpreter fallback. + * 1. For interpreter (BPF_CALL_ARGS): insn->off is the index of + * interpreters_args array, so here using bpf_call_args_imm() + * to get the real address offset. + * 2. For JIT (BPF_CALL): insn->off is the subprog id. + */ + if (insn->code == (BPF_JMP | BPF_CALL_ARGS)) + insn->imm = bpf_call_args_imm(insn->off); + else + insn->imm = insn->off; + + /* Avoid dumping a truncated and misleading pc-relative offset. 
*/ + if (call_off > S16_MAX || call_off < S16_MIN) + insn->off = 0; + else + insn->off = call_off; +} + static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, const struct cred *f_cred) { @@ -4944,6 +5084,9 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, } if (code == (BPF_JMP | BPF_CALL) || code == (BPF_JMP | BPF_CALL_ARGS)) { + /* Restore the legacy xlated dump layout. */ + if (insns[i].src_reg == BPF_PSEUDO_CALL) + prepare_dump_pseudo_call(&insns[i]); if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok(f_cred)) @@ -5019,10 +5162,11 @@ static int bpf_prog_get_info_by_fd(struct file *file, u32 info_len = attr->info.info_len; struct bpf_prog_kstats stats; char __user *uinsns; - u32 ulen; + u32 ulen, len; int err; - err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); + len = offsetofend(struct bpf_prog_info, attach_btf_id); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -5304,10 +5448,11 @@ static int bpf_map_get_info_by_fd(struct file *file, { struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_map_info info; - u32 info_len = attr->info.info_len; + u32 info_len = attr->info.info_len, len; int err; - err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(info), info_len); + len = offsetofend(struct bpf_map_info, hash_size); + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), len, info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -5345,18 +5490,16 @@ static int bpf_map_get_info_by_fd(struct file *file, if (!map->ops->map_get_hash) return -EINVAL; - - if (info.hash_size != SHA256_DIGEST_SIZE) + if (info.hash_size != sizeof(map->sha)) return -EINVAL; - if (!READ_ONCE(map->frozen)) return -EPERM; - err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + err = map->ops->map_get_hash(map); 
if (err != 0) return err; - if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + if (copy_to_user(uhash, map->sha, sizeof(map->sha)) != 0) return -EFAULT; } else if (info.hash_size) { return -EINVAL; @@ -5469,7 +5612,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, #define BPF_BTF_LOAD_LAST_FIELD btf_token_fd -static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) +static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_log_attr *attr_log) { struct bpf_token *token = NULL; @@ -5496,7 +5639,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_ bpf_token_put(token); - return btf_new_fd(attr, uattr, uattr_size); + return btf_new_fd(attr, uattr, attr_log); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd @@ -5697,7 +5840,7 @@ err_put: return err; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid +#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.path_fd static int link_create(union bpf_attr *attr, bpfptr_t uattr) { struct bpf_prog *prog; @@ -5748,6 +5891,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) ret = cgroup_bpf_link_attach(attr, prog); + else if (is_tracing_multi(prog->expected_attach_type)) + ret = bpf_tracing_multi_attach(prog, attr); else ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, @@ -6206,8 +6351,12 @@ put_prog: return ret; } -static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) +static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size, + bpfptr_t uattr_common, unsigned int size_common) { + struct bpf_common_attr attr_common; + u32 offsetof_log_true_size = 0; + struct bpf_log_attr attr_log; union bpf_attr attr; int err; @@ -6221,13 +6370,29 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) if 
(copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; + memset(&attr_common, 0, sizeof(attr_common)); + if (cmd & BPF_COMMON_ATTRS) { + err = bpf_check_uarg_tail_zero(uattr_common, + offsetofend(struct bpf_common_attr, log_true_size), + size_common); + if (err) + return err; + + cmd &= ~BPF_COMMON_ATTRS; + size_common = min_t(u32, size_common, sizeof(attr_common)); + if (copy_from_bpfptr(&attr_common, uattr_common, size_common) != 0) + return -EFAULT; + } else { + size_common = 0; + } + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr); + err = map_create(&attr, uattr, &attr_common, uattr_common, size_common); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); @@ -6245,7 +6410,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) err = map_freeze(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr, uattr, size); + if (size >= offsetofend(union bpf_attr, log_true_size)) + offsetof_log_true_size = offsetof(union bpf_attr, log_true_size); + err = bpf_log_attr_init(&attr_log, attr.log_buf, attr.log_size, attr.log_level, + offsetof_log_true_size, uattr, &attr_common, uattr_common, + size_common); + err = err ?: bpf_prog_load(&attr, uattr, &attr_log); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); @@ -6260,7 +6430,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) err = bpf_prog_detach(&attr); break; case BPF_PROG_QUERY: - err = bpf_prog_query(&attr, uattr.user); + err = bpf_prog_query(&attr, uattr.user, size); break; case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr.user); @@ -6290,7 +6460,12 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: - err = bpf_btf_load(&attr, uattr, size); + if (size >= offsetofend(union bpf_attr, btf_log_true_size)) + offsetof_log_true_size = offsetof(union 
bpf_attr, btf_log_true_size); + err = bpf_log_attr_init(&attr_log, attr.btf_log_buf, attr.btf_log_size, + attr.btf_log_level, offsetof_log_true_size, uattr, + &attr_common, uattr_common, size_common); + err = err ?: bpf_btf_load(&attr, uattr, &attr_log); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); @@ -6356,9 +6531,10 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) return err; } -SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +SYSCALL_DEFINE5(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size, + struct bpf_common_attr __user *, uattr_common, unsigned int, size_common) { - return __sys_bpf(cmd, USER_BPFPTR(uattr), size); + return __sys_bpf(cmd, USER_BPFPTR(uattr), size, USER_BPFPTR(uattr_common), size_common); } static bool syscall_prog_is_valid_access(int off, int size, @@ -6388,7 +6564,7 @@ BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) default: return -EINVAL; } - return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); + return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size, KERNEL_BPFPTR(NULL), 0); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index f02254a21585..1a721fc4bef5 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -30,8 +30,46 @@ static struct hlist_head trampoline_ip_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline tables */ static DEFINE_MUTEX(trampoline_mutex); +/* + * Keep 32 trampoline locks (5 bits) in the pool so trampoline_lock_all() + * stays below MAX_LOCK_DEPTH. Each pool slot has a distinct lockdep + * class because trampoline_lock_all() takes all pool mutexes at once; + * otherwise lockdep would report recursive locking on same-class mutexes. 
+ */ +#define TRAMPOLINE_LOCKS_BITS 5 +#define TRAMPOLINE_LOCKS_TABLE_SIZE (1 << TRAMPOLINE_LOCKS_BITS) + +static struct { + struct mutex mutex; + struct lock_class_key key; +} trampoline_locks[TRAMPOLINE_LOCKS_TABLE_SIZE]; + +static struct mutex *select_trampoline_lock(struct bpf_trampoline *tr) +{ + return &trampoline_locks[hash_ptr(tr, TRAMPOLINE_LOCKS_BITS)].mutex; +} + +static void trampoline_lock(struct bpf_trampoline *tr) +{ + mutex_lock(select_trampoline_lock(tr)); +} + +static void trampoline_unlock(struct bpf_trampoline *tr) +{ + mutex_unlock(select_trampoline_lock(tr)); +} + +struct bpf_trampoline_ops { + int (*register_fentry)(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *data); + int (*unregister_fentry)(struct bpf_trampoline *tr, u32 orig_flags, void *data); + int (*modify_fentry)(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *data); +}; + #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex, + const struct bpf_trampoline_ops *ops, void *data); +static const struct bpf_trampoline_ops trampoline_ops; #ifdef CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS static struct bpf_trampoline *direct_ops_ip_lookup(struct ftrace_ops *ops, unsigned long ip) @@ -69,9 +107,9 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so - * tr->mutex is already locked. + * trampoline's mutex is already locked. */ - lockdep_assert_held_once(&tr->mutex); + lockdep_assert_held_once(select_trampoline_lock(tr)); /* Instead of updating the trampoline here, we propagate * -EAGAIN to register_ftrace_direct(). 
Then we can @@ -91,7 +129,7 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, } /* The normal locking order is - * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) + * select_trampoline_lock(tr) => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) * * The following two commands are called from * @@ -99,12 +137,12 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, * cleanup_direct_functions_after_ipmodify * * In both cases, direct_mutex is already locked. Use - * mutex_trylock(&tr->mutex) to avoid deadlock in race condition - * (something else is making changes to this same trampoline). + * mutex_trylock(select_trampoline_lock(tr)) to avoid deadlock in race condition + * (something else holds the same pool lock). */ - if (!mutex_trylock(&tr->mutex)) { - /* sleep 1 ms to make sure whatever holding tr->mutex makes - * some progress. + if (!mutex_trylock(select_trampoline_lock(tr))) { + /* sleep 1 ms to make sure whatever holding select_trampoline_lock(tr) + * makes some progress. 
*/ msleep(1); return -EAGAIN; @@ -116,20 +154,22 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, unsigned long ip, if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) - ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */, + &trampoline_ops, NULL); break; case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; if (tr->flags & BPF_TRAMP_F_ORIG_STACK) - ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); + ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */, + &trampoline_ops, NULL); break; default: ret = -EINVAL; break; } - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return ret; } #endif @@ -142,7 +182,9 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog) switch (ptype) { case BPF_PROG_TYPE_TRACING: if (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || - eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION) + eatype == BPF_MODIFY_RETURN || eatype == BPF_TRACE_FSESSION || + eatype == BPF_TRACE_FENTRY_MULTI || eatype == BPF_TRACE_FEXIT_MULTI || + eatype == BPF_TRACE_FSESSION_MULTI) return true; return false; case BPF_PROG_TYPE_LSM: @@ -359,7 +401,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key, unsigned long ip) head = &trampoline_ip_table[hash_64(tr->ip, TRAMPOLINE_HASH_BITS)]; hlist_add_head(&tr->hlist_ip, head); refcount_set(&tr->refcnt, 1); - mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); out: @@ -386,9 +427,11 @@ static int bpf_trampoline_update_fentry(struct bpf_trampoline *tr, u32 orig_flag return bpf_arch_text_poke(ip, old_t, new_t, old_addr, new_addr); } -static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr) +static void bpf_tramp_image_put(struct bpf_tramp_image *im); + +static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, void 
*data __maybe_unused) { + void *old_addr = tr->cur_image->image; int ret; if (tr->func.ftrace_managed) @@ -396,13 +439,19 @@ static int unregister_fentry(struct bpf_trampoline *tr, u32 orig_flags, else ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, NULL); - return ret; + if (ret) + return ret; + + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = NULL; + return 0; } -static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, - void *old_addr, void *new_addr, - bool lock_direct_mutex) +static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *data __maybe_unused) { + void *old_addr = tr->cur_image->image; + void *new_addr = im->image; int ret; if (tr->func.ftrace_managed) { @@ -411,12 +460,20 @@ static int modify_fentry(struct bpf_trampoline *tr, u32 orig_flags, ret = bpf_trampoline_update_fentry(tr, orig_flags, old_addr, new_addr); } - return ret; + + if (ret) + return ret; + + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = im; + return 0; } /* first time registering */ -static int register_fentry(struct bpf_trampoline *tr, void *new_addr) +static int register_fentry(struct bpf_trampoline *tr, struct bpf_tramp_image *im, + void *data __maybe_unused) { + void *new_addr = im->image; void *ip = tr->func.addr; unsigned long faddr; int ret; @@ -434,33 +491,42 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) ret = bpf_trampoline_update_fentry(tr, 0, NULL, new_addr); } - return ret; + if (ret) + return ret; + + tr->cur_image = im; + return 0; } -static struct bpf_tramp_links * +static const struct bpf_trampoline_ops trampoline_ops = { + .register_fentry = register_fentry, + .unregister_fentry = unregister_fentry, + .modify_fentry = modify_fentry, +}; + +static struct bpf_tramp_nodes * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { - struct bpf_tramp_link *link; - struct bpf_tramp_links *tlinks; - struct 
bpf_tramp_link **links; + struct bpf_tramp_node *node, **nodes; + struct bpf_tramp_nodes *tnodes; int kind; *total = 0; - tlinks = kzalloc_objs(*tlinks, BPF_TRAMP_MAX); - if (!tlinks) + tnodes = kzalloc_objs(*tnodes, BPF_TRAMP_MAX); + if (!tnodes) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - tlinks[kind].nr_links = tr->progs_cnt[kind]; + tnodes[kind].nr_nodes = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; - links = tlinks[kind].links; + nodes = tnodes[kind].nodes; - hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { - *ip_arg |= link->link.prog->call_get_func_ip; - *links++ = link; + hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) { + *ip_arg |= node->link->prog->call_get_func_ip; + *nodes++ = node; } } - return tlinks; + return tnodes; } static void bpf_tramp_image_free(struct bpf_tramp_image *im) @@ -604,30 +670,29 @@ out: return ERR_PTR(err); } -static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) +static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex, + const struct bpf_trampoline_ops *ops, void *data) { struct bpf_tramp_image *im; - struct bpf_tramp_links *tlinks; + struct bpf_tramp_nodes *tnodes; u32 orig_flags = tr->flags; bool ip_arg = false; int err, total, size; - tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); - if (IS_ERR(tlinks)) - return PTR_ERR(tlinks); + tnodes = bpf_trampoline_get_progs(tr, &total, &ip_arg); + if (IS_ERR(tnodes)) + return PTR_ERR(tnodes); if (total == 0) { - err = unregister_fentry(tr, orig_flags, tr->cur_image->image); - bpf_tramp_image_put(tr->cur_image); - tr->cur_image = NULL; + err = ops->unregister_fentry(tr, orig_flags, data); goto out; } /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); - if (tlinks[BPF_TRAMP_FEXIT].nr_links || - tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { + if (tnodes[BPF_TRAMP_FEXIT].nr_nodes 
|| + tnodes[BPF_TRAMP_MODIFY_RETURN].nr_nodes) { /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME * should not be set together. */ @@ -658,7 +723,7 @@ again: #endif size = arch_bpf_trampoline_size(&tr->func.model, tr->flags, - tlinks, tr->func.addr); + tnodes, tr->func.addr); if (size < 0) { err = size; goto out; @@ -676,7 +741,7 @@ again: } err = arch_prepare_bpf_trampoline(im, im->image, im->image + size, - &tr->func.model, tr->flags, tlinks, + &tr->func.model, tr->flags, tnodes, tr->func.addr); if (err < 0) goto out_free; @@ -685,14 +750,12 @@ again: if (err) goto out_free; - WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, orig_flags, tr->cur_image->image, - im->image, lock_direct_mutex); + err = ops->modify_fentry(tr, orig_flags, im, lock_direct_mutex, data); else /* first time registering */ - err = register_fentry(tr, im->image); + err = ops->register_fentry(tr, im, data); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if (err == -EAGAIN) { @@ -704,34 +767,31 @@ again: goto again; } #endif - if (err) - goto out_free; - if (tr->cur_image) - bpf_tramp_image_put(tr->cur_image); - tr->cur_image = im; +out_free: + if (err) + bpf_tramp_image_free(im); out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; - kfree(tlinks); + kfree(tnodes); return err; - -out_free: - bpf_tramp_image_free(im); - goto out; } static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: + case BPF_TRACE_FENTRY_MULTI: return BPF_TRAMP_FENTRY; case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: + case BPF_TRACE_FEXIT_MULTI: return BPF_TRAMP_FEXIT; case BPF_TRACE_FSESSION: + case BPF_TRACE_FSESSION_MULTI: return BPF_TRAMP_FSESSION; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) @@ -764,39 +824,33 @@ static int bpf_freplace_check_tgt_prog(struct 
bpf_prog *tgt_prog) return 0; } -static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, - struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog) +static struct bpf_tramp_node *fsession_exit(struct bpf_tramp_node *node) { - struct bpf_fsession_link *fslink = NULL; - enum bpf_tramp_prog_type kind; - struct bpf_tramp_link *link_exiting; - struct hlist_head *prog_list; - int err = 0; - int cnt = 0, i; + if (node->link->type == BPF_LINK_TYPE_TRACING) { + struct bpf_tracing_link *link; - kind = bpf_attach_type_to_tramp(link->link.prog); - if (tr->extension_prog) - /* cannot attach fentry/fexit if extension prog is attached. - * cannot overwrite extension prog either. - */ - return -EBUSY; + link = container_of(node->link, struct bpf_tracing_link, link.link); + return &link->fexit; + } else if (node->link->type == BPF_LINK_TYPE_TRACING_MULTI) { + struct bpf_tracing_multi_link *link; + struct bpf_tracing_multi_node *mnode; - for (i = 0; i < BPF_TRAMP_MAX; i++) - cnt += tr->progs_cnt[i]; - - if (kind == BPF_TRAMP_REPLACE) { - /* Cannot attach extension if fentry/fexit are in use. 
*/ - if (cnt) - return -EBUSY; - err = bpf_freplace_check_tgt_prog(tgt_prog); - if (err) - return err; - tr->extension_prog = link->link.prog; - return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, - BPF_MOD_JUMP, NULL, - link->link.prog->bpf_func); + link = container_of(node->link, struct bpf_tracing_multi_link, link); + mnode = container_of(node, struct bpf_tracing_multi_node, node); + return &link->fexits[mnode - link->nodes]; } + return NULL; +} + +static int bpf_trampoline_add_prog(struct bpf_trampoline *tr, + struct bpf_tramp_node *node, + int cnt) +{ + enum bpf_tramp_prog_type kind; + struct bpf_tramp_node *node_existing, *fexit; + struct hlist_head *prog_list; + + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_FSESSION) { prog_list = &tr->progs_hlist[BPF_TRAMP_FENTRY]; cnt++; @@ -805,59 +859,112 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; - if (!hlist_unhashed(&link->tramp_hlist)) + if (!hlist_unhashed(&node->tramp_hlist)) /* prog already linked */ return -EBUSY; - hlist_for_each_entry(link_exiting, prog_list, tramp_hlist) { - if (link_exiting->link.prog != link->link.prog) + hlist_for_each_entry(node_existing, prog_list, tramp_hlist) { + if (node_existing->link->prog != node->link->prog) continue; /* prog already linked */ return -EBUSY; } - hlist_add_head(&link->tramp_hlist, prog_list); + hlist_add_head(&node->tramp_hlist, prog_list); if (kind == BPF_TRAMP_FSESSION) { tr->progs_cnt[BPF_TRAMP_FENTRY]++; - fslink = container_of(link, struct bpf_fsession_link, link.link); - hlist_add_head(&fslink->fexit.tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); + fexit = fsession_exit(node); + if (WARN_ON_ONCE(!fexit)) + return -EINVAL; + hlist_add_head(&fexit->tramp_hlist, &tr->progs_hlist[BPF_TRAMP_FEXIT]); tr->progs_cnt[BPF_TRAMP_FEXIT]++; } else { tr->progs_cnt[kind]++; } - err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); - if (err) { - 
hlist_del_init(&link->tramp_hlist); - if (kind == BPF_TRAMP_FSESSION) { - tr->progs_cnt[BPF_TRAMP_FENTRY]--; - hlist_del_init(&fslink->fexit.tramp_hlist); - tr->progs_cnt[BPF_TRAMP_FEXIT]--; - } else { - tr->progs_cnt[kind]--; - } + return 0; +} + +static void bpf_trampoline_remove_prog(struct bpf_trampoline *tr, + struct bpf_tramp_node *node) +{ + enum bpf_tramp_prog_type kind; + struct bpf_tramp_node *fexit; + + kind = bpf_attach_type_to_tramp(node->link->prog); + if (kind == BPF_TRAMP_FSESSION) { + fexit = fsession_exit(node); + if (WARN_ON_ONCE(!fexit)) + return; + hlist_del_init(&fexit->tramp_hlist); + tr->progs_cnt[BPF_TRAMP_FEXIT]--; + kind = BPF_TRAMP_FENTRY; } + hlist_del_init(&node->tramp_hlist); + tr->progs_cnt[kind]--; +} + +static int __bpf_trampoline_link_prog(struct bpf_tramp_node *node, + struct bpf_trampoline *tr, + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) +{ + enum bpf_tramp_prog_type kind; + int err = 0; + int cnt = 0, i; + + kind = bpf_attach_type_to_tramp(node->link->prog); + if (tr->extension_prog) + /* cannot attach fentry/fexit if extension prog is attached. + * cannot overwrite extension prog either. + */ + return -EBUSY; + + for (i = 0; i < BPF_TRAMP_MAX; i++) + cnt += tr->progs_cnt[i]; + + if (kind == BPF_TRAMP_REPLACE) { + /* Cannot attach extension if fentry/fexit are in use. 
*/ + if (cnt) + return -EBUSY; + err = bpf_freplace_check_tgt_prog(tgt_prog); + if (err) + return err; + tr->extension_prog = node->link->prog; + return bpf_arch_text_poke(tr->func.addr, BPF_MOD_NOP, + BPF_MOD_JUMP, NULL, + node->link->prog->bpf_func); + } + err = bpf_trampoline_add_prog(tr, node, cnt); + if (err) + return err; + err = bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); + if (err) + bpf_trampoline_remove_prog(tr, node); return err; } -int bpf_trampoline_link_prog(struct bpf_tramp_link *link, +int bpf_trampoline_link_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; - mutex_lock(&tr->mutex); - err = __bpf_trampoline_link_prog(link, tr, tgt_prog); - mutex_unlock(&tr->mutex); + trampoline_lock(tr); + err = __bpf_trampoline_link_prog(node, tr, tgt_prog, &trampoline_ops, NULL); + trampoline_unlock(tr); return err; } -static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +static int __bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, - struct bpf_prog *tgt_prog) + struct bpf_prog *tgt_prog, + const struct bpf_trampoline_ops *ops, + void *data) { enum bpf_tramp_prog_type kind; int err; - kind = bpf_attach_type_to_tramp(link->link.prog); + kind = bpf_attach_type_to_tramp(node->link->prog); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, @@ -867,29 +974,21 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, guard(mutex)(&tgt_prog->aux->ext_mutex); tgt_prog->aux->is_extended = false; return err; - } else if (kind == BPF_TRAMP_FSESSION) { - struct bpf_fsession_link *fslink = - container_of(link, struct bpf_fsession_link, link.link); - - hlist_del_init(&fslink->fexit.tramp_hlist); - tr->progs_cnt[BPF_TRAMP_FEXIT]--; - kind = BPF_TRAMP_FENTRY; } - hlist_del_init(&link->tramp_hlist); - tr->progs_cnt[kind]--; - return bpf_trampoline_update(tr, true /* 
lock_direct_mutex */); + bpf_trampoline_remove_prog(tr, node); + return bpf_trampoline_update(tr, true /* lock_direct_mutex */, ops, data); } /* bpf_trampoline_unlink_prog() should never fail. */ -int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, +int bpf_trampoline_unlink_prog(struct bpf_tramp_node *node, struct bpf_trampoline *tr, struct bpf_prog *tgt_prog) { int err; - mutex_lock(&tr->mutex); - err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog); - mutex_unlock(&tr->mutex); + trampoline_lock(tr); + err = __bpf_trampoline_unlink_prog(node, tr, tgt_prog, &trampoline_ops, NULL); + trampoline_unlock(tr); return err; } @@ -903,7 +1002,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link) if (!shim_link->trampoline) return; - WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL)); + WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link.node, shim_link->trampoline, NULL)); bpf_trampoline_put(shim_link->trampoline); } @@ -949,8 +1048,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog p->type = BPF_PROG_TYPE_LSM; p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); - bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, - &bpf_shim_tramp_link_lops, p, attach_type); + bpf_tramp_link_init(&shim_link->link, BPF_LINK_TYPE_UNSPEC, + &bpf_shim_tramp_link_lops, p, attach_type, 0); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; @@ -959,15 +1058,15 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, bpf_func_t bpf_func) { - struct bpf_tramp_link *link; + struct bpf_tramp_node *node; int kind; for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { - hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { - struct bpf_prog *p = link->link.prog; + hlist_for_each_entry(node, &tr->progs_hlist[kind], tramp_hlist) { + struct bpf_prog *p = 
node->link->prog; if (p->bpf_func == bpf_func) - return container_of(link, struct bpf_shim_tramp_link, link); + return container_of(node, struct bpf_shim_tramp_link, link.node); } } @@ -999,12 +1098,12 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, if (!tr) return -ENOMEM; - mutex_lock(&tr->mutex); + trampoline_lock(tr); shim_link = cgroup_shim_find(tr, bpf_func); if (shim_link && !IS_ERR(bpf_link_inc_not_zero(&shim_link->link.link))) { /* Reusing existing shim attached by the other program. */ - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return 0; } @@ -1017,23 +1116,23 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, goto err; } - err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL); + err = __bpf_trampoline_link_prog(&shim_link->link.node, tr, NULL, &trampoline_ops, NULL); if (err) goto err; shim_link->trampoline = tr; /* note, we're still holding tr refcnt from above */ - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); return 0; err: - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); if (shim_link) bpf_link_put(&shim_link->link.link); - /* have to release tr while _not_ holding its mutex */ + /* have to release tr while _not_ holding pool mutex for trampoline */ bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return err; @@ -1054,9 +1153,9 @@ void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) if (WARN_ON_ONCE(!tr)) return; - mutex_lock(&tr->mutex); + trampoline_lock(tr); shim_link = cgroup_shim_find(tr, bpf_func); - mutex_unlock(&tr->mutex); + trampoline_unlock(tr); if (shim_link) bpf_link_put(&shim_link->link.link); @@ -1074,14 +1173,14 @@ struct bpf_trampoline *bpf_trampoline_get(u64 key, if (!tr) return NULL; - mutex_lock(&tr->mutex); + trampoline_lock(tr); if (tr->func.addr) goto out; memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); tr->func.addr = (void *)tgt_info->tgt_addr; out: - mutex_unlock(&tr->mutex); + 
trampoline_unlock(tr); return tr; } @@ -1094,7 +1193,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; - WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); for (i = 0; i < BPF_TRAMP_MAX; i++) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) @@ -1333,7 +1431,7 @@ bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, + struct bpf_tramp_nodes *tnodes, void *func_addr) { return -ENOTSUPP; @@ -1367,11 +1465,288 @@ int __weak arch_protect_bpf_trampoline(void *image, unsigned int size) } int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, - struct bpf_tramp_links *tlinks, void *func_addr) + struct bpf_tramp_nodes *tnodes, void *func_addr) { return -ENOTSUPP; } +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \ + defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) && \ + defined(CONFIG_BPF_SYSCALL) + +static void trampoline_lock_all(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + mutex_lock(&trampoline_locks[i].mutex); +} + +static void trampoline_unlock_all(void) +{ + int i; + + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + mutex_unlock(&trampoline_locks[i].mutex); +} + +static void remove_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + ftrace_hash_remove(data->reg); + ftrace_hash_remove(data->unreg); + ftrace_hash_remove(data->modify); +} + +static void clear_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + remove_tracing_multi_data(data); + + free_ftrace_hash(data->reg); + free_ftrace_hash(data->unreg); + free_ftrace_hash(data->modify); +} + +static int init_tracing_multi_data(struct bpf_tracing_multi_data *data) +{ + data->reg = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + data->unreg = 
alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + data->modify = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); + + if (!data->reg || !data->unreg || !data->modify) { + clear_tracing_multi_data(data); + return -ENOMEM; + } + return 0; +} + +static void ftrace_hash_add(struct ftrace_hash *hash, struct ftrace_func_entry *entry, + unsigned long ip, unsigned long direct) +{ + entry->ip = ip; + entry->direct = direct; + add_ftrace_hash_entry(hash, entry); +} + +static int register_fentry_multi(struct bpf_trampoline *tr, struct bpf_tramp_image *im, void *ptr) +{ + unsigned long addr = (unsigned long) im->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->reg, data->entry, ip, addr); + tr->cur_image = im; + return 0; +} + +static int unregister_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, void *ptr) +{ + unsigned long addr = (unsigned long) tr->cur_image->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->unreg, data->entry, ip, addr); + tr->cur_image = NULL; + return 0; +} + +static int modify_fentry_multi(struct bpf_trampoline *tr, u32 orig_flags, struct bpf_tramp_image *im, + bool lock_direct_mutex, void *ptr) +{ + unsigned long addr = (unsigned long) im->image; + unsigned long ip = ftrace_location(tr->ip); + struct bpf_tracing_multi_data *data = ptr; + + if (bpf_trampoline_use_jmp(tr->flags)) + addr = ftrace_jmp_set(addr); + + ftrace_hash_add(data->modify, data->entry, ip, addr); + tr->cur_image = im; + return 0; +} + +static const struct bpf_trampoline_ops trampoline_multi_ops = { + .register_fentry = register_fentry_multi, + .unregister_fentry = unregister_fentry_multi, + .modify_fentry = modify_fentry_multi, +}; + +static void bpf_trampoline_multi_attach_init(struct 
bpf_trampoline *tr) +{ + tr->multi_attach.old_image = tr->cur_image; + tr->multi_attach.old_flags = tr->flags; +} + +static void bpf_trampoline_multi_attach_free(struct bpf_trampoline *tr) +{ + if (tr->multi_attach.old_image) + bpf_tramp_image_put(tr->multi_attach.old_image); + + tr->multi_attach.old_image = NULL; + tr->multi_attach.old_flags = 0; +} + +static void bpf_trampoline_multi_attach_rollback(struct bpf_trampoline *tr) +{ + if (tr->cur_image) + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = tr->multi_attach.old_image; + tr->flags = tr->multi_attach.old_flags; + + tr->multi_attach.old_image = NULL; + tr->multi_attach.old_flags = 0; +} + +#define for_each_mnode_cnt(mnode, link, cnt) \ + for (i = 0, mnode = &link->nodes[i]; i < cnt; i++, mnode = &link->nodes[i]) + +#define for_each_mnode(mnode, link) \ + for_each_mnode_cnt(mnode, link, link->nodes_cnt) + +int bpf_trampoline_multi_attach(struct bpf_prog *prog, u32 *ids, + struct bpf_tracing_multi_link *link) +{ + struct bpf_tracing_multi_data *data = &link->data; + struct bpf_attach_target_info tgt_info = {}; + struct btf *btf = prog->aux->attach_btf; + struct bpf_tracing_multi_node *mnode; + struct bpf_trampoline *tr; + int i, err, rollback_cnt; + u64 key; + + for_each_mnode(mnode, link) { + rollback_cnt = i; + + err = bpf_check_attach_btf_id_multi(btf, prog, ids[i], &tgt_info); + if (err) + goto rollback_put; + + key = bpf_trampoline_compute_key(NULL, btf, ids[i]); + + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) { + err = -ENOMEM; + goto rollback_put; + } + + mnode->trampoline = tr; + mnode->node.link = &link->link; + mnode->node.cookie = link->cookies ? link->cookies[i] : 0; + + if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) { + link->fexits[i].link = &link->link; + link->fexits[i].cookie = link->cookies ? 
link->cookies[i] : 0; + } + + cond_resched(); + } + + err = init_tracing_multi_data(data); + if (err) { + rollback_cnt = link->nodes_cnt; + goto rollback_put; + } + + trampoline_lock_all(); + + for_each_mnode(mnode, link) { + bpf_trampoline_multi_attach_init(mnode->trampoline); + + data->entry = &mnode->entry; + err = __bpf_trampoline_link_prog(&mnode->node, mnode->trampoline, NULL, + &trampoline_multi_ops, data); + if (err) { + rollback_cnt = i; + goto rollback_unlink; + } + } + + rollback_cnt = link->nodes_cnt; + if (ftrace_hash_count(data->reg)) { + err = update_ftrace_direct_add(&direct_ops, data->reg); + if (err) + goto rollback_unlink; + } + + if (ftrace_hash_count(data->modify)) { + err = update_ftrace_direct_mod(&direct_ops, data->modify, true); + if (err) { + if (ftrace_hash_count(data->reg)) + WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->reg)); + goto rollback_unlink; + } + } + + for_each_mnode(mnode, link) + bpf_trampoline_multi_attach_free(mnode->trampoline); + + trampoline_unlock_all(); + + remove_tracing_multi_data(data); + return 0; + +rollback_unlink: + for_each_mnode_cnt(mnode, link, rollback_cnt) { + bpf_trampoline_remove_prog(mnode->trampoline, &mnode->node); + bpf_trampoline_multi_attach_rollback(mnode->trampoline); + } + + trampoline_unlock_all(); + + clear_tracing_multi_data(data); + rollback_cnt = link->nodes_cnt; + +rollback_put: + for_each_mnode_cnt(mnode, link, rollback_cnt) + bpf_trampoline_put(mnode->trampoline); + + return err; +} + +int bpf_trampoline_multi_detach(struct bpf_prog *prog, struct bpf_tracing_multi_link *link) +{ + struct bpf_tracing_multi_data *data = &link->data; + struct bpf_tracing_multi_node *mnode; + int i; + + trampoline_lock_all(); + + for_each_mnode(mnode, link) { + data->entry = &mnode->entry; + bpf_trampoline_multi_attach_init(mnode->trampoline); + WARN_ON_ONCE(__bpf_trampoline_unlink_prog(&mnode->node, mnode->trampoline, + NULL, &trampoline_multi_ops, data)); + } + + if 
(ftrace_hash_count(data->unreg)) + WARN_ON_ONCE(update_ftrace_direct_del(&direct_ops, data->unreg)); + if (ftrace_hash_count(data->modify)) + WARN_ON_ONCE(update_ftrace_direct_mod(&direct_ops, data->modify, true)); + + for_each_mnode(mnode, link) + bpf_trampoline_multi_attach_free(mnode->trampoline); + + trampoline_unlock_all(); + + for_each_mnode(mnode, link) + bpf_trampoline_put(mnode->trampoline); + + clear_tracing_multi_data(data); + return 0; +} + +#undef for_each_mnode_cnt +#undef for_each_mnode + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && + CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS && + CONFIG_BPF_SYSCALL */ + static int __init init_trampolines(void) { int i; @@ -1380,6 +1755,8 @@ static int __init init_trampolines(void) INIT_HLIST_HEAD(&trampoline_key_table[i]); for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&trampoline_ip_table[i]); + for (i = 0; i < TRAMPOLINE_LOCKS_TABLE_SIZE; i++) + __mutex_init(&trampoline_locks[i].mutex, "trampoline_lock", &trampoline_locks[i].key); return 0; } late_initcall(init_trampolines); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 69d75515ed3f..2abc79dbf281 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -26,6 +26,7 @@ #include <linux/poison.h> #include <linux/module.h> #include <linux/cpumask.h> +#include <linux/cnum.h> #include <linux/bpf_mem_alloc.h> #include <net/xdp.h> #include <linux/trace_events.h> @@ -199,14 +200,15 @@ struct bpf_verifier_stack_elem { #define BPF_PRIV_STACK_MIN_SIZE 64 -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx); -static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id); -static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id); +static int release_reference_nomark(struct bpf_verifier_state *state, int id); +static int release_reference(struct bpf_verifier_env *env, int id); static 
void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); +static bool is_tracing_prog_type(enum bpf_prog_type type); static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg); -static bool is_trusted_reg(const struct bpf_reg_state *reg); +static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg); static inline bool in_sleepable_context(struct bpf_verifier_env *env); static const char *non_sleepable_context_description(struct bpf_verifier_env *env); static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg); @@ -230,8 +232,28 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } +static void update_ref_obj(struct ref_obj_desc *ref_obj, struct bpf_reg_state *reg) +{ + ref_obj->id = reg->id; + ref_obj->parent_id = reg->parent_id; + ref_obj->cnt++; +} + +static int validate_ref_obj(struct bpf_verifier_env *env, struct ref_obj_desc *ref_obj) +{ + if (ref_obj->cnt > 1) { + verifier_bug(env, "function expects only one referenced object but got %d\n", + ref_obj->cnt); + return -EFAULT; + } + + return 0; +} + struct bpf_call_arg_meta { struct bpf_map_desc map; + struct bpf_dynptr_desc dynptr; + struct ref_obj_desc ref_obj; bool raw_mode; bool pkt_access; u8 release_regno; @@ -239,8 +261,6 @@ struct bpf_call_arg_meta { int access_size; int mem_size; u64 msize_max_value; - int ref_obj_id; - int dynptr_id; int func_id; struct btf *btf; u32 btf_id; @@ -261,6 +281,41 @@ struct bpf_kfunc_meta { struct btf *btf_vmlinux; +typedef struct argno { + int argno; +} argno_t; + +static argno_t argno_from_reg(u32 regno) +{ + return (argno_t){ .argno = regno }; +} + +static argno_t argno_from_arg(u32 arg) +{ + return (argno_t){ .argno = -arg }; +} + +static int reg_from_argno(argno_t a) +{ + if (a.argno >= 0) + return a.argno; + if (a.argno >= 
-MAX_BPF_FUNC_REG_ARGS) + return -a.argno; + return -1; +} + +static int arg_from_argno(argno_t a) +{ + if (a.argno < 0) + return -a.argno; + return -1; +} + +static int arg_idx_from_argno(argno_t a) +{ + return arg_from_argno(a) - 1; +} + static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); @@ -290,12 +345,12 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, bool unknown = true; verbose(env, "%s the register %s has", ctx, reg_name); - if (reg->smin_value > S64_MIN) { - verbose(env, " smin=%lld", reg->smin_value); + if (reg_smin(reg) > S64_MIN) { + verbose(env, " smin=%lld", reg_smin(reg)); unknown = false; } - if (reg->smax_value < S64_MAX) { - verbose(env, " smax=%lld", reg->smax_value); + if (reg_smax(reg) < S64_MAX) { + verbose(env, " smax=%lld", reg_smax(reg)); unknown = false; } if (unknown) @@ -303,7 +358,7 @@ static void verbose_invalid_scalar(struct bpf_verifier_env *env, verbose(env, " should have been in [%d, %d]\n", range.minval, range.maxval); } -static bool reg_not_null(const struct bpf_reg_state *reg) +static bool reg_not_null(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { enum bpf_reg_type type; @@ -317,7 +372,7 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_VALUE || type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || - (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || + (type == PTR_TO_BTF_ID && is_trusted_reg(env, reg)) || (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) || type == CONST_PTR_TO_MAP; } @@ -434,15 +489,9 @@ static bool is_ptr_cast_function(enum bpf_func_id func_id) func_id == BPF_FUNC_skc_to_tcp_request_sock; } -static bool is_dynptr_ref_function(enum bpf_func_id func_id) -{ - return func_id == BPF_FUNC_dynptr_data; -} - static bool is_sync_callback_calling_kfunc(u32 btf_id); static bool is_async_callback_calling_kfunc(u32 btf_id); static bool is_callback_calling_kfunc(u32 
btf_id); -static bool is_bpf_throw_kfunc(struct bpf_insn *insn); static bool is_bpf_wq_set_callback_kfunc(u32 btf_id); static bool is_task_work_add_kfunc(u32 func_id); @@ -498,22 +547,6 @@ bool bpf_is_may_goto_insn(struct bpf_insn *insn) return insn->code == (BPF_JMP | BPF_JCOND) && insn->src_reg == BPF_MAY_GOTO; } -static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, - const struct bpf_map *map) -{ - int ref_obj_uses = 0; - - if (is_ptr_cast_function(func_id)) - ref_obj_uses++; - if (is_acquire_function(func_id, map)) - ref_obj_uses++; - if (is_dynptr_ref_function(func_id)) - ref_obj_uses++; - - return ref_obj_uses > 1; -} - - static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots) { int allocated_slots = state->allocated_stack / BPF_REG_SIZE; @@ -610,43 +643,44 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) } } -static bool dynptr_type_refcounted(enum bpf_dynptr_type type) +static bool dynptr_type_referenced(enum bpf_dynptr_type type) { return type == BPF_DYNPTR_TYPE_RINGBUF || type == BPF_DYNPTR_TYPE_FILE; } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot, int dynptr_id); + bool first_slot, int id, int parent_id); static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, struct bpf_reg_state *sreg1, struct bpf_reg_state *sreg2, - enum bpf_dynptr_type type) + enum bpf_dynptr_type type, int parent_id) { int id = ++env->id_gen; - __mark_dynptr_reg(sreg1, type, true, id); - __mark_dynptr_reg(sreg2, type, false, id); + __mark_dynptr_reg(sreg1, type, true, id, parent_id); + __mark_dynptr_reg(sreg2, type, false, id, parent_id); } static void mark_dynptr_cb_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_dynptr_type type) { - __mark_dynptr_reg(reg, type, true, ++env->id_gen); + __mark_dynptr_reg(reg, type, true, ++env->id_gen, 0); } static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct 
bpf_func_state *state, int spi); static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - enum bpf_arg_type arg_type, int insn_idx, int clone_ref_obj_id) + enum bpf_arg_type arg_type, int insn_idx, + struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr) { struct bpf_func_state *state = bpf_func(env, reg); + int spi, i, err, parent_id = 0; enum bpf_dynptr_type type; - int spi, i, err; spi = dynptr_get_spi(env, reg); if (spi < 0) @@ -677,94 +711,69 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, - &state->stack[spi - 1].spilled_ptr, type); + if (dynptr->type == BPF_DYNPTR_TYPE_INVALID) { /* dynptr constructors */ + err = validate_ref_obj(env, ref_obj); + if (err) + return err; - if (dynptr_type_refcounted(type)) { - /* The id is used to track proper releasing */ - int id; + /* Track parent's id if the parent is a referenced object */ + parent_id = ref_obj->id; - if (clone_ref_obj_id) - id = clone_ref_obj_id; - else - id = acquire_reference(env, insn_idx); + if (dynptr_type_referenced(type)) { + int id; - if (id < 0) - return id; + /* + * Create an intermediate reference that tracks the referenced + * object for the referenced dynptr. Freeing a referenced dynptr + * through helpers/kfuncs will invalidate all clones. 
+ */ + id = acquire_reference(env, insn_idx, parent_id); + if (id < 0) + return id; - state->stack[spi].spilled_ptr.ref_obj_id = id; - state->stack[spi - 1].spilled_ptr.ref_obj_id = id; + parent_id = id; + } + } else { /* bpf_dynptr_clone() */ + parent_id = dynptr->parent_id; } + mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type, parent_id); + return 0; } -static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) +static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_stack_state *stack) { int i; for (i = 0; i < BPF_REG_SIZE; i++) { - state->stack[spi].slot_type[i] = STACK_INVALID; - state->stack[spi - 1].slot_type[i] = STACK_INVALID; + stack[0].slot_type[i] = STACK_INVALID; + stack[1].slot_type[i] = STACK_INVALID; } - bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + bpf_mark_reg_not_init(env, &stack[0].spilled_ptr); + bpf_mark_reg_not_init(env, &stack[1].spilled_ptr); } static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = bpf_func(env, reg); - int spi, ref_obj_id, i; + int spi; - /* - * This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot - * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr - * is safe to do directly. - */ - if (reg->type == CONST_PTR_TO_DYNPTR) { - verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); - return -EFAULT; - } spi = dynptr_get_spi(env, reg); if (spi < 0) return spi; - if (!dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - invalidate_dynptr(env, state, spi); - return 0; - } - - ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; - - /* If the dynptr has a ref_obj_id, then we need to invalidate - * two things: - * - * 1) Any dynptrs with a matching ref_obj_id (clones) - * 2) Any slices derived from this dynptr. 
+ /* + * For referenced dynptr, release the parent ref which cascades to + * all clones and derived slices. For non-referenced dynptr, only + * the dynptr and slices derived from it will be invalidated. */ - - /* Invalidate any slices associated with this dynptr */ - WARN_ON_ONCE(release_reference(env, ref_obj_id)); - - /* Invalidate any dynptr clones */ - for (i = 1; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].spilled_ptr.ref_obj_id != ref_obj_id) - continue; - - /* it should always be the case that if the ref obj id - * matches then the stack slot also belongs to a - * dynptr - */ - if (state->stack[i].slot_type[0] != STACK_DYNPTR) { - verifier_bug(env, "misconfigured ref_obj_id"); - return -EFAULT; - } - if (state->stack[i].spilled_ptr.dynptr.first_slot) - invalidate_dynptr(env, state, i); - } - - return 0; + reg = &state->stack[spi].spilled_ptr; + return release_reference(env, dynptr_type_referenced(reg->dynptr.type) + ? reg->parent_id + : reg->id); } static void __mark_reg_unknown(const struct bpf_verifier_env *env, @@ -778,12 +787,29 @@ static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_ __mark_reg_unknown(env, reg); } +static int dynptr_ref_cnt(struct bpf_verifier_env *env, int v_parent_id) +{ + struct bpf_stack_state *stack; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int ref_cnt = 0; + + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, 1 << STACK_DYNPTR, ({ + if (!stack || stack->slot_type[0] != STACK_DYNPTR) + continue; + if (!stack->spilled_ptr.dynptr.first_slot) + continue; + if (stack->spilled_ptr.parent_id == v_parent_id) + ref_cnt++; + })); + + return ref_cnt; +} + static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi) { - struct bpf_func_state *fstate; - struct bpf_reg_state *dreg; - int i, dynptr_id; + int err = 0; /* We always ensure that STACK_DYNPTR is never set partially, * hence just checking for 
slot_type[0] is enough. This is @@ -797,56 +823,25 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, if (!state->stack[spi].spilled_ptr.dynptr.first_slot) spi = spi + 1; - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - int ref_obj_id = state->stack[spi].spilled_ptr.ref_obj_id; - int ref_cnt = 0; - - /* - * A referenced dynptr can be overwritten only if there is at - * least one other dynptr sharing the same ref_obj_id, - * ensuring the reference can still be properly released. - */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_DYNPTR) - continue; - if (!state->stack[i].spilled_ptr.dynptr.first_slot) - continue; - if (state->stack[i].spilled_ptr.ref_obj_id == ref_obj_id) - ref_cnt++; - } - - if (ref_cnt <= 1) { - verbose(env, "cannot overwrite referenced dynptr\n"); - return -EINVAL; - } + /* + * A referenced dynptr can be overwritten only if there is at + * least one other dynptr sharing the same virtual ref parent, + * ensuring the reference can still be properly released. + */ + if (dynptr_type_referenced(state->stack[spi].spilled_ptr.dynptr.type) && + dynptr_ref_cnt(env, state->stack[spi].spilled_ptr.parent_id) <= 1) { + verbose(env, "cannot overwrite referenced dynptr\n"); + return -EINVAL; } - mark_stack_slot_scratched(env, spi); - mark_stack_slot_scratched(env, spi - 1); - - /* Writing partially to one dynptr stack slot destroys both. 
*/ - for (i = 0; i < BPF_REG_SIZE; i++) { - state->stack[spi].slot_type[i] = STACK_INVALID; - state->stack[spi - 1].slot_type[i] = STACK_INVALID; + /* Invalidate the dynptr and any derived slices */ + err = release_reference(env, state->stack[spi].spilled_ptr.id); + if (!err) { + mark_stack_slot_scratched(env, spi); + mark_stack_slot_scratched(env, spi - 1); } - dynptr_id = state->stack[spi].spilled_ptr.id; - /* Invalidate any slices associated with this dynptr */ - bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({ - /* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ - if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) - continue; - if (dreg->dynptr_id == dynptr_id) - mark_reg_invalid(env, dreg); - })); - - /* Do not release reference state, we are destroying dynptr on stack, - * not using some helper to release it. Just reset register. - */ - bpf_mark_reg_not_init(env, &state->stack[spi].spilled_ptr); - bpf_mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - - return 0; + return err; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) @@ -946,7 +941,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, if (spi < 0) return spi; - id = acquire_reference(env, insn_idx); + id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; @@ -962,7 +957,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->ref_obj_id = i == 0 ? id : 0; + st->id = i == 0 ? 
id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; st->iter.state = BPF_ITER_STATE_ACTIVE; @@ -992,7 +987,7 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, struct bpf_reg_state *st = &slot->spilled_ptr; if (i == 0) - WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + WARN_ON_ONCE(release_reference(env, st->id)); bpf_mark_reg_not_init(env, st); @@ -1048,10 +1043,10 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s if (st->type & PTR_UNTRUSTED) return -EPROTO; - /* only main (first) slot has ref_obj_id set */ - if (i == 0 && !st->ref_obj_id) + /* only main (first) slot has id set */ + if (i == 0 && !st->id) return -EINVAL; - if (i != 0 && st->ref_obj_id) + if (i != 0 && st->id) return -EINVAL; if (st->iter.btf != btf || st->iter.btf_id != btf_id) return -EINVAL; @@ -1090,7 +1085,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->ref_obj_id = id; + st->id = id; st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) @@ -1124,7 +1119,7 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r return -EINVAL; } - err = release_irq_state(env->cur_state, st->ref_obj_id); + err = release_irq_state(env->cur_state, st->id); WARN_ON_ONCE(err && err != -EACCES); if (err) { int insn_idx = 0; @@ -1188,7 +1183,7 @@ static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_r slot = &state->stack[spi]; st = &slot->spilled_ptr; - if (!st->ref_obj_id) + if (!st->id) return -EINVAL; for (i = 0; i < BPF_REG_SIZE; i++) @@ -1340,6 +1335,18 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st return -ENOMEM; dst->allocated_stack = src->allocated_stack; + + /* copy stack args state */ + n = src->out_stack_arg_cnt; + if (n) { + dst->stack_arg_regs = copy_array(dst->stack_arg_regs, src->stack_arg_regs, n, + sizeof(struct 
bpf_reg_state), + GFP_KERNEL_ACCOUNT); + if (!dst->stack_arg_regs) + return -ENOMEM; + } + + dst->out_stack_arg_cnt = src->out_stack_arg_cnt; return 0; } @@ -1381,6 +1388,23 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state return 0; } +static int grow_stack_arg_slots(struct bpf_verifier_env *env, + struct bpf_func_state *state, int cnt) +{ + size_t old_n = state->out_stack_arg_cnt; + + if (old_n >= cnt) + return 0; + + state->stack_arg_regs = realloc_array(state->stack_arg_regs, old_n, cnt, + sizeof(struct bpf_reg_state)); + if (!state->stack_arg_regs) + return -ENOMEM; + + state->out_stack_arg_cnt = cnt; + return 0; +} + /* Acquire a pointer id from the env and update the state->refs to include * this new pointer reference. * On success, returns a valid pointer id to associate with the register @@ -1400,7 +1424,7 @@ static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_e return &state->refs[new_ofs]; } -static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx, int parent_id) { struct bpf_reference_state *s; @@ -1409,6 +1433,7 @@ static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; s->type = REF_TYPE_PTR; s->id = ++env->id_gen; + s->parent_id = parent_id; return s->id; } @@ -1465,17 +1490,25 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) return; } -static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) +static bool find_reference_state(struct bpf_verifier_state *state, int id) { int i; - for (i = 0; i < state->acquired_refs; i++) - if (state->refs[i].id == ptr_id) + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != REF_TYPE_PTR) + continue; + if (state->refs[i].id == id) return true; + } return false; } +static bool reg_is_referenced(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) +{ + 
return find_reference_state(env->cur_state, reg->id); +} + static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { void *prev_ptr = NULL; @@ -1543,6 +1576,7 @@ static void free_func_state(struct bpf_func_state *state) { if (!state) return; + kfree(state->stack_arg_regs); kfree(state->stack); kfree(state); } @@ -1751,6 +1785,22 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; } +static const char *reg_arg_name(struct bpf_verifier_env *env, argno_t argno) +{ + char *buf = env->tmp_arg_name; + int len = sizeof(env->tmp_arg_name); + int arg, regno = reg_from_argno(argno); + + if (regno >= 0) { + snprintf(buf, len, "R%d", regno); + } else { + arg = arg_from_argno(argno); + snprintf(buf, len, "*(R11-%u)", (arg - MAX_BPF_FUNC_REG_ARGS) * BPF_REG_SIZE); + } + + return buf; +} + static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; @@ -1759,15 +1809,8 @@ static const int caller_saved[CALLER_SAVED_REGS] = { static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const(imm); - reg->smin_value = (s64)imm; - reg->smax_value = (s64)imm; - reg->umin_value = imm; - reg->umax_value = imm; - - reg->s32_min_value = (s32)imm; - reg->s32_max_value = (s32)imm; - reg->u32_min_value = (u32)imm; - reg->u32_max_value = (u32)imm; + reg->r64 = cnum64_from_urange(imm, imm); + reg->r32 = cnum32_from_urange((u32)imm, (u32)imm); } /* Mark the unknown part of a register (variable offset or scalar value) as @@ -1779,17 +1822,14 @@ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) memset(((u8 *)reg) + sizeof(reg->type), 0, offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->id = 0; - reg->ref_obj_id = 0; + reg->parent_id = 0; ___mark_reg_known(reg, imm); } static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm) { reg->var_off = tnum_const_subreg(reg->var_off, imm); - 
reg->s32_min_value = (s32)imm; - reg->s32_max_value = (s32)imm; - reg->u32_min_value = (u32)imm; - reg->u32_max_value = (u32)imm; + reg->r32 = cnum32_from_urange((u32)imm, (u32)imm); } /* Mark the 'variable offset' part of a register as zero. This should be @@ -1817,7 +1857,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, } static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, - bool first_slot, int dynptr_id) + bool first_slot, int id, int parent_id) { /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply @@ -1826,7 +1866,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty __mark_reg_known_zero(reg); reg->type = CONST_PTR_TO_DYNPTR; /* Give each dynptr a unique id to uniquely associate slices to it. */ - reg->id = dynptr_id; + reg->id = id; + reg->parent_id = parent_id; reg->dynptr.type = type; reg->dynptr.first_slot = first_slot; } @@ -1900,34 +1941,21 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, tnum_equals_const(reg->var_off, 0); } -/* Reset the min/max bounds of a register */ -static void __mark_reg_unbounded(struct bpf_reg_state *reg) +static void __mark_reg32_unbounded(struct bpf_reg_state *reg) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; - - reg->s32_min_value = S32_MIN; - reg->s32_max_value = S32_MAX; - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + reg->r32 = CNUM32_UNBOUNDED; } static void __mark_reg64_unbounded(struct bpf_reg_state *reg) { - reg->smin_value = S64_MIN; - reg->smax_value = S64_MAX; - reg->umin_value = 0; - reg->umax_value = U64_MAX; + reg->r64 = CNUM64_UNBOUNDED; } -static void __mark_reg32_unbounded(struct bpf_reg_state *reg) +/* Reset the min/max bounds of a register */ +static void __mark_reg_unbounded(struct bpf_reg_state *reg) { - reg->s32_min_value = S32_MIN; - 
reg->s32_max_value = S32_MAX; - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + __mark_reg64_unbounded(reg); + __mark_reg32_unbounded(reg); } static void reset_reg64_and_tnum(struct bpf_reg_state *reg) @@ -1942,19 +1970,32 @@ static void reset_reg32_and_tnum(struct bpf_reg_state *reg) reg->var_off = tnum_unknown; } -static void __update_reg32_bounds(struct bpf_reg_state *reg) +static struct cnum32 cnum32_from_tnum(struct tnum tnum) { - struct tnum var32_off = tnum_subreg(reg->var_off); + tnum = tnum_subreg(tnum); + if ((tnum.mask & S32_MIN) || (tnum.value & S32_MIN)) + /* min signed is max(sign bit) | min(other bits) */ + /* max signed is min(sign bit) | max(other bits) */ + return cnum32_from_srange(tnum.value | (tnum.mask & S32_MIN), + tnum.value | (tnum.mask & S32_MAX)); + else + return cnum32_from_urange(tnum.value, (tnum.value | tnum.mask)); +} - /* min signed is max(sign bit) | min(other bits) */ - reg->s32_min_value = max_t(s32, reg->s32_min_value, - var32_off.value | (var32_off.mask & S32_MIN)); - /* max signed is min(sign bit) | max(other bits) */ - reg->s32_max_value = min_t(s32, reg->s32_max_value, - var32_off.value | (var32_off.mask & S32_MAX)); - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); - reg->u32_max_value = min(reg->u32_max_value, - (u32)(var32_off.value | var32_off.mask)); +static struct cnum64 cnum64_from_tnum(struct tnum tnum) +{ + if ((tnum.mask & S64_MIN) || (tnum.value & S64_MIN)) + /* min signed is max(sign bit) | min(other bits) */ + /* max signed is min(sign bit) | max(other bits) */ + return cnum64_from_srange(tnum.value | (tnum.mask & S64_MIN), + tnum.value | (tnum.mask & S64_MAX)); + else + return cnum64_from_urange(tnum.value, (tnum.value | tnum.mask)); +} + +static void __update_reg32_bounds(struct bpf_reg_state *reg) +{ + cnum32_intersect_with(&reg->r32, cnum32_from_tnum(reg->var_off)); } static void __update_reg64_bounds(struct bpf_reg_state *reg) @@ -1962,26 +2003,18 @@ static void
__update_reg64_bounds(struct bpf_reg_state *reg) u64 tnum_next, tmax; bool umin_in_tnum; - /* min signed is max(sign bit) | min(other bits) */ - reg->smin_value = max_t(s64, reg->smin_value, - reg->var_off.value | (reg->var_off.mask & S64_MIN)); - /* max signed is min(sign bit) | max(other bits) */ - reg->smax_value = min_t(s64, reg->smax_value, - reg->var_off.value | (reg->var_off.mask & S64_MAX)); - reg->umin_value = max(reg->umin_value, reg->var_off.value); - reg->umax_value = min(reg->umax_value, - reg->var_off.value | reg->var_off.mask); + cnum64_intersect_with(&reg->r64, cnum64_from_tnum(reg->var_off)); /* Check if u64 and tnum overlap in a single value */ - tnum_next = tnum_step(reg->var_off, reg->umin_value); - umin_in_tnum = (reg->umin_value & ~reg->var_off.mask) == reg->var_off.value; + tnum_next = tnum_step(reg->var_off, reg_umin(reg)); + umin_in_tnum = (reg_umin(reg) & ~reg->var_off.mask) == reg->var_off.value; tmax = reg->var_off.value | reg->var_off.mask; - if (umin_in_tnum && tnum_next > reg->umax_value) { + if (umin_in_tnum && tnum_next > reg_umax(reg)) { /* The u64 range and the tnum only overlap in umin. * u64: ---[xxxxxx]----- * tnum: --xx----------x- */ - ___mark_reg_known(reg, reg->umin_value); + ___mark_reg_known(reg, reg_umin(reg)); } else if (!umin_in_tnum && tnum_next == tmax) { /* The u64 range and the tnum only overlap in the maximum value * represented by the tnum, called tmax. @@ -1989,8 +2022,8 @@ static void __update_reg64_bounds(struct bpf_reg_state *reg) * tnum: xx-----x-------- */ ___mark_reg_known(reg, tmax); - } else if (!umin_in_tnum && tnum_next <= reg->umax_value && - tnum_step(reg->var_off, tnum_next) > reg->umax_value) { + } else if (!umin_in_tnum && tnum_next <= reg_umax(reg) && + tnum_step(reg->var_off, tnum_next) > reg_umax(reg)) { /* The u64 range and the tnum only overlap in between umin * (excluded) and umax.
* u64: ---[xxxxxx]----- @@ -2006,329 +2039,19 @@ static void __update_reg_bounds(struct bpf_reg_state *reg) __update_reg64_bounds(reg); } -/* Uses signed min/max values to inform unsigned, and vice-versa */ static void deduce_bounds_32_from_64(struct bpf_reg_state *reg) { - /* If upper 32 bits of u64/s64 range don't change, we can use lower 32 - * bits to improve our u32/s32 boundaries. - * - * E.g., the case where we have upper 32 bits as zero ([10, 20] in - * u64) is pretty trivial, it's obvious that in u32 we'll also have - * [10, 20] range. But this property holds for any 64-bit range as - * long as upper 32 bits in that entire range of values stay the same. - * - * E.g., u64 range [0x10000000A, 0x10000000F] ([4294967306, 4294967311] - * in decimal) has the same upper 32 bits throughout all the values in - * that range. As such, lower 32 bits form a valid [0xA, 0xF] ([10, 15]) - * range. - * - * Note also, that [0xA, 0xF] is a valid range both in u32 and in s32, - * following the rules outlined below about u64/s64 correspondence - * (which equally applies to u32 vs s32 correspondence). In general it - * depends on actual hexadecimal values of 32-bit range. They can form - * only valid u32, or only valid s32 ranges in some cases. - * - * So we use all these insights to derive bounds for subregisters here. 
- */ - if ((reg->umin_value >> 32) == (reg->umax_value >> 32)) { - /* u64 to u32 casting preserves validity of low 32 bits as - * a range, if upper 32 bits are the same - */ - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->umin_value); - reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->umax_value); - - if ((s32)reg->umin_value <= (s32)reg->umax_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); - } - } - if ((reg->smin_value >> 32) == (reg->smax_value >> 32)) { - /* low 32 bits should form a proper u32 range */ - if ((u32)reg->smin_value <= (u32)reg->smax_value) { - reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)reg->smin_value); - reg->u32_max_value = min_t(u32, reg->u32_max_value, (u32)reg->smax_value); - } - /* low 32 bits should form a proper s32 range */ - if ((s32)reg->smin_value <= (s32)reg->smax_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); - } - } - /* Special case where upper bits form a small sequence of two - * sequential numbers (in 32-bit unsigned space, so 0xffffffff to - * 0x00000000 is also valid), while lower bits form a proper s32 range - * going from negative numbers to positive numbers. E.g., let's say we - * have s64 range [-1, 1] ([0xffffffffffffffff, 0x0000000000000001]). - * Possible s64 values are {-1, 0, 1} ({0xffffffffffffffff, - * 0x0000000000000000, 0x00000000000001}). Ignoring upper 32 bits, - * we still get a valid s32 range [-1, 1] ([0xffffffff, 0x00000001]). - * Note that it doesn't have to be 0xffffffff going to 0x00000000 in - * upper 32 bits. As a random example, s64 range - * [0xfffffff0fffffff0; 0xfffffff100000010], forms a valid s32 range - * [-16, 16] ([0xfffffff0; 0x00000010]) in its 32 bit subregister. 
- */ - if ((u32)(reg->umin_value >> 32) + 1 == (u32)(reg->umax_value >> 32) && - (s32)reg->umin_value < 0 && (s32)reg->umax_value >= 0) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->umin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->umax_value); - } - if ((u32)(reg->smin_value >> 32) + 1 == (u32)(reg->smax_value >> 32) && - (s32)reg->smin_value < 0 && (s32)reg->smax_value >= 0) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, (s32)reg->smin_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, (s32)reg->smax_value); - } -} - -static void deduce_bounds_32_from_32(struct bpf_reg_state *reg) -{ - /* if u32 range forms a valid s32 range (due to matching sign bit), - * try to learn from that - */ - if ((s32)reg->u32_min_value <= (s32)reg->u32_max_value) { - reg->s32_min_value = max_t(s32, reg->s32_min_value, reg->u32_min_value); - reg->s32_max_value = min_t(s32, reg->s32_max_value, reg->u32_max_value); - } - /* If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. - */ - if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { - reg->u32_min_value = max_t(u32, reg->s32_min_value, reg->u32_min_value); - reg->u32_max_value = min_t(u32, reg->s32_max_value, reg->u32_max_value); - } else { - if (reg->u32_max_value < (u32)reg->s32_min_value) { - /* See __reg64_deduce_bounds() for detailed explanation. 
- * Refine ranges in the following situation: - * - * 0 U32_MAX - * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s32 range xxxxxxxxx] [xxxxxxx| - * 0 S32_MAX S32_MIN -1 - */ - reg->s32_min_value = (s32)reg->u32_min_value; - reg->u32_max_value = min_t(u32, reg->u32_max_value, reg->s32_max_value); - } else if ((u32)reg->s32_max_value < reg->u32_min_value) { - /* - * 0 U32_MAX - * | [xxxxxxxxxxxxxx u32 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxxxxxx] [xxxxxxxxxxxx s32 range | - * 0 S32_MAX S32_MIN -1 - */ - reg->s32_max_value = (s32)reg->u32_max_value; - reg->u32_min_value = max_t(u32, reg->u32_min_value, reg->s32_min_value); - } - } -} - -static void deduce_bounds_64_from_64(struct bpf_reg_state *reg) -{ - /* If u64 range forms a valid s64 range (due to matching sign bit), - * try to learn from that. Let's do a bit of ASCII art to see when - * this is happening. Let's take u64 range first: - * - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * - * Valid u64 range is formed when umin and umax are anywhere in the - * range [0, U64_MAX], and umin <= umax. u64 case is simple and - * straightforward. Let's see how s64 range maps onto the same range - * of values, annotated below the line for comparison: - * - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * 0 S64_MAX S64_MIN -1 - * - * So s64 values basically start in the middle and they are logically - * contiguous to the right of it, wrapping around from -1 to 0, and - * then finishing as S64_MAX (0x7fffffffffffffff) right before - * S64_MIN. We can try drawing the continuity of u64 vs s64 values - * more visually as mapped to sign-agnostic range of hex values. 
- * - * u64 start u64 end - * _______________________________________________________________ - * / \ - * 0 0x7fffffffffffffff 0x8000000000000000 U64_MAX - * |-------------------------------|--------------------------------| - * 0 S64_MAX S64_MIN -1 - * / \ - * >------------------------------ -------------------------------> - * s64 continues... s64 end s64 start s64 "midpoint" - * - * What this means is that, in general, we can't always derive - * something new about u64 from any random s64 range, and vice versa. - * - * But we can do that in two particular cases. One is when entire - * u64/s64 range is *entirely* contained within left half of the above - * diagram or when it is *entirely* contained in the right half. I.e.: - * - * |-------------------------------|--------------------------------| - * ^ ^ ^ ^ - * A B C D - * - * [A, B] and [C, D] are contained entirely in their respective halves - * and form valid contiguous ranges as both u64 and s64 values. [A, B] - * will be non-negative both as u64 and s64 (and in fact it will be - * identical ranges no matter the signedness). [C, D] treated as s64 - * will be a range of negative values, while in u64 it will be - * non-negative range of values larger than 0x8000000000000000. - * - * Now, any other range here can't be represented in both u64 and s64 - * simultaneously. E.g., [A, C], [A, D], [B, C], [B, D] are valid - * contiguous u64 ranges, but they are discontinuous in s64. [B, C] - * in s64 would be properly presented as [S64_MIN, C] and [B, S64_MAX], - * for example. Similarly, valid s64 range [D, A] (going from negative - * to positive values), would be two separate [D, U64_MAX] and [0, A] - * ranges as u64. Currently reg_state can't represent two segments per - * numeric domain, so in such situations we can only derive maximal - * possible range ([0, U64_MAX] for u64, and [S64_MIN, S64_MAX] for s64). 
- * - * So we use these facts to derive umin/umax from smin/smax and vice - * versa only if they stay within the same "half". This is equivalent - * to checking sign bit: lower half will have sign bit as zero, upper - * half have sign bit 1. Below in code we simplify this by just - * casting umin/umax as smin/smax and checking if they form valid - * range, and vice versa. Those are equivalent checks. - */ - if ((s64)reg->umin_value <= (s64)reg->umax_value) { - reg->smin_value = max_t(s64, reg->smin_value, reg->umin_value); - reg->smax_value = min_t(s64, reg->smax_value, reg->umax_value); - } - /* If we cannot cross the sign boundary, then signed and unsigned bounds - * are the same, so combine. This works even in the negative case, e.g. - * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. - */ - if ((u64)reg->smin_value <= (u64)reg->smax_value) { - reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); - reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); - } else { - /* If the s64 range crosses the sign boundary, then it's split - * between the beginning and end of the U64 domain. In that - * case, we can derive new bounds if the u64 range overlaps - * with only one end of the s64 range. - * - * In the following example, the u64 range overlaps only with - * positive portion of the s64 range. - * - * 0 U64_MAX - * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s64 range xxxxxxxxx] [xxxxxxx| - * 0 S64_MAX S64_MIN -1 - * - * We can thus derive the following new s64 and u64 ranges. - * - * 0 U64_MAX - * | [xxxxxx u64 range xxxxx] | - * |----------------------------|----------------------------| - * | [xxxxxx s64 range xxxxx] | - * 0 S64_MAX S64_MIN -1 - * - * If they overlap in two places, we can't derive anything - * because reg_state can't represent two ranges per numeric - * domain. 
- * - * 0 U64_MAX - * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx| - * 0 S64_MAX S64_MIN -1 - * - * The first condition below corresponds to the first diagram - * above. - */ - if (reg->umax_value < (u64)reg->smin_value) { - reg->smin_value = (s64)reg->umin_value; - reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value); - } else if ((u64)reg->smax_value < reg->umin_value) { - /* This second condition considers the case where the u64 range - * overlaps with the negative portion of the s64 range: - * - * 0 U64_MAX - * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | - * |----------------------------|----------------------------| - * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | - * 0 S64_MAX S64_MIN -1 - */ - reg->smax_value = (s64)reg->umax_value; - reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value); - } - } + cnum32_intersect_with(&reg->r32, cnum32_from_cnum64(reg->r64)); } static void deduce_bounds_64_from_32(struct bpf_reg_state *reg) { - /* Try to tighten 64-bit bounds from 32-bit knowledge, using 32-bit - * values on both sides of 64-bit range in hope to have tighter range. - * E.g., if r1 is [0x1'00000000, 0x3'80000000], and we learn from - * 32-bit signed > 0 operation that s32 bounds are now [1; 0x7fffffff]. - * With this, we can substitute 1 as low 32-bits of _low_ 64-bit bound - * (0x100000000 -> 0x100000001) and 0x7fffffff as low 32-bits of - * _high_ 64-bit bound (0x380000000 -> 0x37fffffff) and arrive at a - * better overall bounds for r1 as [0x1'000000001; 0x3'7fffffff]. - * We just need to make sure that derived bounds we are intersecting - * with are well-formed ranges in respective s64 or u64 domain, just - * like we do with similar kinds of 32-to-64 or 64-to-32 adjustments.
- */ - __u64 new_umin, new_umax; - __s64 new_smin, new_smax; - - /* u32 -> u64 tightening, it's always well-formed */ - new_umin = (reg->umin_value & ~0xffffffffULL) | reg->u32_min_value; - new_umax = (reg->umax_value & ~0xffffffffULL) | reg->u32_max_value; - reg->umin_value = max_t(u64, reg->umin_value, new_umin); - reg->umax_value = min_t(u64, reg->umax_value, new_umax); - /* u32 -> s64 tightening, u32 range embedded into s64 preserves range validity */ - new_smin = (reg->smin_value & ~0xffffffffULL) | reg->u32_min_value; - new_smax = (reg->smax_value & ~0xffffffffULL) | reg->u32_max_value; - reg->smin_value = max_t(s64, reg->smin_value, new_smin); - reg->smax_value = min_t(s64, reg->smax_value, new_smax); - - /* Here we would like to handle a special case after sign extending load, - * when upper bits for a 64-bit range are all 1s or all 0s. - * - * Upper bits are all 1s when register is in a range: - * [0xffff_ffff_0000_0000, 0xffff_ffff_ffff_ffff] - * Upper bits are all 0s when register is in a range: - * [0x0000_0000_0000_0000, 0x0000_0000_ffff_ffff] - * Together this forms are continuous range: - * [0xffff_ffff_0000_0000, 0x0000_0000_ffff_ffff] - * - * Now, suppose that register range is in fact tighter: - * [0xffff_ffff_8000_0000, 0x0000_0000_ffff_ffff] (R) - * Also suppose that it's 32-bit range is positive, - * meaning that lower 32-bits of the full 64-bit register - * are in the range: - * [0x0000_0000, 0x7fff_ffff] (W) - * - * If this happens, then any value in a range: - * [0xffff_ffff_0000_0000, 0xffff_ffff_7fff_ffff] - * is smaller than a lowest bound of the range (R): - * 0xffff_ffff_8000_0000 - * which means that upper bits of the full 64-bit register - * can't be all 1s, when lower bits are in range (W). - * - * Note that: - * - 0xffff_ffff_8000_0000 == (s64)S32_MIN - * - 0x0000_0000_7fff_ffff == (s64)S32_MAX - * These relations are used in the conditions below. 
- */ - if (reg->s32_min_value >= 0 && reg->smin_value >= S32_MIN && reg->smax_value <= S32_MAX) { - reg->smin_value = reg->s32_min_value; - reg->smax_value = reg->s32_max_value; - reg->umin_value = reg->s32_min_value; - reg->umax_value = reg->s32_max_value; - reg->var_off = tnum_intersect(reg->var_off, - tnum_range(reg->smin_value, reg->smax_value)); - } + reg->r64 = cnum64_cnum32_intersect(reg->r64, reg->r32); } static void __reg_deduce_bounds(struct bpf_reg_state *reg) { - deduce_bounds_64_from_64(reg); deduce_bounds_32_from_64(reg); - deduce_bounds_32_from_32(reg); deduce_bounds_64_from_32(reg); } @@ -2336,11 +2059,11 @@ static void __reg_deduce_bounds(struct bpf_reg_state *reg) static void __reg_bound_offset(struct bpf_reg_state *reg) { struct tnum var64_off = tnum_intersect(reg->var_off, - tnum_range(reg->umin_value, - reg->umax_value)); + tnum_range(reg_umin(reg), + reg_umax(reg))); struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off), - tnum_range(reg->u32_min_value, - reg->u32_max_value)); + tnum_range(reg_u32_min(reg), + reg_u32_max(reg))); reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } @@ -2366,35 +2089,25 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) __update_reg_bounds(reg); } -static bool range_bounds_violation(struct bpf_reg_state *reg) -{ - return (reg->umin_value > reg->umax_value || reg->smin_value > reg->smax_value || - reg->u32_min_value > reg->u32_max_value || - reg->s32_min_value > reg->s32_max_value); -} - static bool const_tnum_range_mismatch(struct bpf_reg_state *reg) { - u64 uval = reg->var_off.value; - s64 sval = (s64)uval; - if (!tnum_is_const(reg->var_off)) return false; - return reg->umin_value != uval || reg->umax_value != uval || - reg->smin_value != sval || reg->smax_value != sval; + return !cnum64_is_const(reg->r64) || reg->r64.base != reg->var_off.value; } static bool const_tnum_range_mismatch_32(struct bpf_reg_state *reg) { - u32 uval32 = tnum_subreg(reg->var_off).value; - s32 sval32 = 
(s32)uval32; - if (!tnum_subreg_is_const(reg->var_off)) return false; - return reg->u32_min_value != uval32 || reg->u32_max_value != uval32 || - reg->s32_min_value != sval32 || reg->s32_max_value != sval32; + return !cnum32_is_const(reg->r32) || reg->r32.base != tnum_subreg(reg->var_off).value; +} + +static bool range_bounds_violation(struct bpf_reg_state *reg) +{ + return cnum32_is_empty(reg->r32) || cnum64_is_empty(reg->r64); } static int reg_bounds_sanity_check(struct bpf_verifier_env *env, @@ -2419,12 +2132,11 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, return 0; out: - verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " - "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", - ctx, msg, reg->umin_value, reg->umax_value, - reg->smin_value, reg->smax_value, - reg->u32_min_value, reg->u32_max_value, - reg->s32_min_value, reg->s32_max_value, + verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s r64={.base=%#llx, .size=%#llx} " + "r32={.base=%#x, .size=%#x} var_off=(%#llx, %#llx)", + ctx, msg, + reg->r64.base, reg->r64.size, + reg->r32.base, reg->r32.size, reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; @@ -2432,44 +2144,15 @@ out: return 0; } -static bool __reg32_bound_s64(s32 a) -{ - return a >= 0 && a <= S32_MAX; -} - -static void __reg_assign_32_into_64(struct bpf_reg_state *reg) -{ - reg->umin_value = reg->u32_min_value; - reg->umax_value = reg->u32_max_value; - - /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must - * be positive otherwise set to worse case bounds and refine later - * from tnum. - */ - if (__reg32_bound_s64(reg->s32_min_value) && - __reg32_bound_s64(reg->s32_max_value)) { - reg->smin_value = reg->s32_min_value; - reg->smax_value = reg->s32_max_value; - } else { - reg->smin_value = 0; - reg->smax_value = U32_MAX; - } -} - /* Mark a register as having a completely unknown (scalar) value. 
*/ void bpf_mark_reg_unknown_imprecise(struct bpf_reg_state *reg) { - /* - * Clear type, off, and union(map_ptr, range) and - * padding between 'type' and union - */ - memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); + s32 subreg_def = reg->subreg_def; + + memset(reg, 0, sizeof(*reg)); reg->type = SCALAR_VALUE; - reg->id = 0; - reg->ref_obj_id = 0; reg->var_off = tnum_unknown; - reg->frameno = 0; - reg->precise = false; + reg->subreg_def = subreg_def; __mark_reg_unbounded(reg); } @@ -2497,11 +2180,12 @@ static int __mark_reg_s32_range(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; - reg->s32_min_value = max_t(s32, reg->s32_min_value, s32_min); - reg->s32_max_value = min_t(s32, reg->s32_max_value, s32_max); - - reg->smin_value = max_t(s64, reg->smin_value, s32_min); - reg->smax_value = min_t(s64, reg->smax_value, s32_max); + reg_set_srange32(reg, + max_t(s32, reg_s32_min(reg), s32_min), + min_t(s32, reg_s32_max(reg), s32_max)); + reg_set_srange64(reg, + max_t(s64, reg_smin(reg), s32_min), + min_t(s64, reg_smax(reg), s32_max)); reg_bounds_sync(reg); @@ -3296,50 +2980,13 @@ out: return ret; } -static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi, int nr_slots) +static void mark_stack_slots_scratched(struct bpf_verifier_env *env, + int spi, int nr_slots) { int i; for (i = 0; i < nr_slots; i++) mark_stack_slot_scratched(env, spi - i); - return 0; -} - -static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - int spi; - - /* For CONST_PTR_TO_DYNPTR, it must have already been done by - * check_reg_arg in check_helper_call and mark_btf_func_reg_size in - * check_kfunc_call. - */ - if (reg->type == CONST_PTR_TO_DYNPTR) - return 0; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - /* Caller ensures dynptr is valid and initialized, which means spi is in - * bounds and spi is the first dynptr slot. Simply mark stack slot as - * read. 
- */ - return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS); -} - -static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int spi, int nr_slots) -{ - return mark_stack_slot_obj_read(env, reg, spi, nr_slots); -} - -static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - int spi; - - spi = irq_flag_get_spi(env, reg); - if (spi < 0) - return spi; - return mark_stack_slot_obj_read(env, reg, spi, 1); } /* This function is supposed to be used by the following 32-bit optimization @@ -3492,17 +3139,12 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return __check_reg_arg(env, state->regs, regno, t); } -static int insn_stack_access_flags(int frameno, int spi) -{ - return INSN_F_STACK_ACCESS | (spi << INSN_F_SPI_SHIFT) | frameno; -} - static void mark_indirect_target(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].indirect_target = true; } -#define LR_FRAMENO_BITS 3 +#define LR_FRAMENO_BITS 4 #define LR_SPI_BITS 6 #define LR_ENTRY_BITS (LR_SPI_BITS + LR_FRAMENO_BITS + 1) #define LR_SIZE_BITS 4 @@ -3511,7 +3153,11 @@ static void mark_indirect_target(struct bpf_verifier_env *env, int idx) #define LR_SIZE_MASK ((1ull << LR_SIZE_BITS) - 1) #define LR_SPI_OFF LR_FRAMENO_BITS #define LR_IS_REG_OFF (LR_SPI_BITS + LR_FRAMENO_BITS) -#define LINKED_REGS_MAX 6 +#define LINKED_REGS_MAX 5 + +static_assert(MAX_CALL_FRAMES <= (1 << LR_FRAMENO_BITS)); +static_assert(LINKED_REGS_MAX < (1 << LR_SIZE_BITS)); +static_assert(LINKED_REGS_MAX * LR_ENTRY_BITS + LR_SIZE_BITS <= 64); struct linked_reg { u8 frameno; @@ -3535,10 +3181,11 @@ static struct linked_reg *linked_regs_push(struct linked_regs *s) return NULL; } -/* Use u64 as a vector of 6 10-bit values, use first 4-bits to track +/* + * Use u64 as a vector of 5 11-bit values, use first 4-bits to track * number of elements currently in stack. 
- * Pack one history entry for linked registers as 10 bits in the following format: - * - 3-bits frameno + * Pack one history entry for linked registers as 11 bits in the following format: + * - 4-bits frameno * - 6-bits spi_or_reg * - 1-bit is_reg */ @@ -3734,12 +3381,6 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, src_reg->id = ++env->id_gen; } -/* Copy src state preserving dst->parent and dst->live fields */ -static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) -{ - *dst = *src; -} - static void save_register_state(struct bpf_verifier_env *env, struct bpf_func_state *state, int spi, struct bpf_reg_state *reg, @@ -3747,7 +3388,7 @@ static void save_register_state(struct bpf_verifier_env *env, { int i; - copy_register_state(&state->stack[spi].spilled_ptr, reg); + state->stack[spi].spilled_ptr = *reg; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -3764,7 +3405,7 @@ static bool is_bpf_st_mem(struct bpf_insn *insn) static int get_reg_width(struct bpf_reg_state *reg) { - return fls64(reg->umax_value); + return fls64(reg_umax(reg)); } /* See comment for mark_fastcall_pattern_for_call() */ @@ -3817,7 +3458,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; - int insn_flags = insn_stack_access_flags(state->frameno, spi); + int insn_flags = INSN_F_STACK_ACCESS; + int hist_spi = spi, hist_frame = state->frameno; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits @@ -3913,11 +3555,12 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, 
insn_flags, + hist_spi, hist_frame, 0); return 0; } -/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is +/* Write the stack: 'stack[ptr_reg + off] = value_regno'. 'ptr_reg' is * known to contain a variable offset. * This function checks whether the write is permitted and conservatively * tracks the effects of the write, considering that each stack slot in the @@ -3938,13 +3581,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, static int check_stack_write_var_off(struct bpf_verifier_env *env, /* func where register points to */ struct bpf_func_state *state, - int ptr_regno, int off, int size, + struct bpf_reg_state *ptr_reg, int off, int size, int value_regno, int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int min_off, max_off; int i, err; - struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + struct bpf_reg_state *value_reg = NULL; struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; bool writing_zero = false; /* set if the fact that we're writing a zero is used to let any @@ -3953,9 +3596,8 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, bool zero_used = false; cur = env->cur_state->frame[env->cur_state->curframe]; - ptr_reg = &cur->regs[ptr_regno]; - min_off = ptr_reg->smin_value + off; - max_off = ptr_reg->smax_value + off + size; + min_off = reg_smin(ptr_reg) + off; + max_off = reg_smax(ptr_reg) + off + size; if (value_regno >= 0) value_reg = &cur->regs[value_regno]; if ((value_reg && bpf_register_is_null(value_reg)) || @@ -4110,7 +3752,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; struct bpf_reg_state *reg; u8 *stype, type; - int insn_flags = insn_stack_access_flags(reg_state->frameno, spi); + int insn_flags = INSN_F_STACK_ACCESS; + int hist_spi = spi, hist_frame = reg_state->frameno; stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; @@ -4147,7 
+3790,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * with the destination register on fill. */ assign_scalar_id_before_mov(env, reg); - copy_register_state(&state->regs[dst_regno], reg); + state->regs[dst_regno] = *reg; state->regs[dst_regno].subreg_def = subreg_def; /* Break the relation on a narrowing fill. @@ -4202,7 +3845,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * with the destination register on fill. */ assign_scalar_id_before_mov(env, reg); - copy_register_state(&state->regs[dst_regno], reg); + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions @@ -4241,7 +3884,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return bpf_push_jmp_history(env, env->cur_state, insn_flags, 0); + return bpf_push_jmp_history(env, env->cur_state, insn_flags, + hist_spi, hist_frame, 0); return 0; } @@ -4250,8 +3894,8 @@ enum bpf_access_src { ACCESS_HELPER = 2, /* the access is performed by a helper */ }; -static int check_stack_range_initialized(struct bpf_verifier_env *env, - int regno, int off, int access_size, +static int check_stack_range_initialized(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t argno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta); @@ -4261,37 +3905,35 @@ static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) return cur_regs(env) + regno; } -/* Read the stack at 'ptr_regno + off' and put the result into the register +/* Read the stack at 'reg + off' and put the result into the register * 'dst_regno'. - * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * 'off' includes the pointer register's fixed offset(i.e. 
'reg->off'), * but not its variable offset. * 'size' is assumed to be <= reg size and the access is assumed to be aligned. * * As opposed to check_stack_read_fixed_off, this function doesn't deal with * filling registers (i.e. reads of spilled register cannot be detected when * the offset is not fixed). We conservatively mark 'dst_regno' as containing - * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * SCALAR_VALUE. That's why we assert that the 'reg' has a variable * offset; for a fixed offset check_stack_read_fixed_off should be used * instead. */ -static int check_stack_read_var_off(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, int dst_regno) +static int check_stack_read_var_off(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t ptr_argno, int off, int size, int dst_regno) { - /* The state of the source register. */ - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *ptr_state = bpf_func(env, reg); int err; int min_off, max_off; /* Note that we pass a NULL meta, so raw access will not be permitted. */ - err = check_stack_range_initialized(env, ptr_regno, off, size, + err = check_stack_range_initialized(env, reg, ptr_argno, off, size, false, BPF_READ, NULL); if (err) return err; - min_off = reg->smin_value + off; - max_off = reg->smax_value + off; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off; mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); check_fastcall_stack_contract(env, ptr_state, env->insn_idx, min_off); return 0; @@ -4307,10 +3949,9 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, * can be -1, meaning that the read value is not going to a register. 
*/ static int check_stack_read(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, + struct bpf_reg_state *reg, argno_t ptr_argno, int off, int size, int dst_regno) { - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; /* Some accesses are only permitted with a static offset. */ @@ -4346,7 +3987,7 @@ static int check_stack_read(struct bpf_verifier_env *env, * than fixed offset ones. Note that dst_regno >= 0 on this * branch. */ - err = check_stack_read_var_off(env, ptr_regno, off, size, + err = check_stack_read_var_off(env, reg, ptr_argno, off, size, dst_regno); } return err; @@ -4356,17 +3997,16 @@ static int check_stack_read(struct bpf_verifier_env *env, /* check_stack_write dispatches to check_stack_write_fixed_off or * check_stack_write_var_off. * - * 'ptr_regno' is the register used as a pointer into the stack. + * 'reg' is the register used as a pointer into the stack. * 'value_regno' is the register whose value we're writing to the stack. It can * be -1, meaning that we're not writing from a register. * * The caller must ensure that the offset falls within the maximum stack size. */ static int check_stack_write(struct bpf_verifier_env *env, - int ptr_regno, int off, int size, + struct bpf_reg_state *reg, int off, int size, int value_regno, int insn_idx) { - struct bpf_reg_state *reg = reg_state(env, ptr_regno); struct bpf_func_state *state = bpf_func(env, reg); int err; @@ -4379,28 +4019,135 @@ static int check_stack_write(struct bpf_verifier_env *env, * than fixed offset ones. */ err = check_stack_write_var_off(env, state, - ptr_regno, off, size, + reg, off, size, value_regno, insn_idx); } return err; } -static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, +/* + * Write a value to the outgoing stack arg area. + * off is a negative offset from r11 (e.g. -8 for arg6, -16 for arg7). 
+ */ +static int check_stack_arg_write(struct bpf_verifier_env *env, struct bpf_func_state *state, + int off, struct bpf_reg_state *value_reg) +{ + int max_stack_arg_regs = MAX_BPF_FUNC_ARGS - MAX_BPF_FUNC_REG_ARGS; + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + int spi = -off / BPF_REG_SIZE - 1; + struct bpf_reg_state *arg; + int err; + + if (spi >= max_stack_arg_regs) { + verbose(env, "stack arg write offset %d exceeds max %d stack args\n", + off, max_stack_arg_regs); + return -EINVAL; + } + + err = grow_stack_arg_slots(env, state, spi + 1); + if (err) + return err; + + /* Track the max outgoing stack arg slot count. */ + if (spi + 1 > subprog->max_out_stack_arg_cnt) + subprog->max_out_stack_arg_cnt = spi + 1; + + if (value_reg) { + state->stack_arg_regs[spi] = *value_reg; + } else { + /* BPF_ST: store immediate, treat as scalar */ + arg = &state->stack_arg_regs[spi]; + arg->type = SCALAR_VALUE; + __mark_reg_known(arg, env->prog->insnsi[env->insn_idx].imm); + } + state->no_stack_arg_load = true; + return bpf_push_jmp_history(env, env->cur_state, + INSN_F_STACK_ARG_ACCESS, spi, 0, 0); +} + +/* + * Read a value from the incoming stack arg area. + * off is a positive offset from r11 (e.g. +8 for arg6, +16 for arg7). 
+ */ +static int check_stack_arg_read(struct bpf_verifier_env *env, struct bpf_func_state *state, + int off, int dst_regno) +{ + struct bpf_subprog_info *subprog = &env->subprog_info[state->subprogno]; + struct bpf_verifier_state *vstate = env->cur_state; + int spi = off / BPF_REG_SIZE - 1; + struct bpf_func_state *caller, *cur; + struct bpf_reg_state *arg; + + if (state->no_stack_arg_load) { + verbose(env, "r11 load must be before any r11 store or call insn\n"); + return -EINVAL; + } + + if (spi + 1 > bpf_in_stack_arg_cnt(subprog)) { + verbose(env, "invalid read from stack arg off %d depth %d\n", + off, bpf_in_stack_arg_cnt(subprog) * BPF_REG_SIZE); + return -EACCES; + } + + caller = vstate->frame[vstate->curframe - 1]; + arg = &caller->stack_arg_regs[spi]; + cur = vstate->frame[vstate->curframe]; + cur->regs[dst_regno] = *arg; + return bpf_push_jmp_history(env, env->cur_state, + INSN_F_STACK_ARG_ACCESS, spi, 0, 0); +} + +static int mark_stack_arg_precision(struct bpf_verifier_env *env, int arg_idx) +{ + struct bpf_func_state *caller = cur_func(env); + int spi = arg_idx - MAX_BPF_FUNC_REG_ARGS; + + bt_set_frame_stack_arg_slot(&env->bt, caller->frameno, spi); + return mark_chain_precision_batch(env, env->cur_state); +} + +static int check_outgoing_stack_args(struct bpf_verifier_env *env, struct bpf_func_state *caller, + int nargs) +{ + int i, spi; + + for (i = MAX_BPF_FUNC_REG_ARGS; i < nargs; i++) { + spi = i - MAX_BPF_FUNC_REG_ARGS; + if (spi >= caller->out_stack_arg_cnt || + caller->stack_arg_regs[spi].type == NOT_INIT) { + verbose(env, "callee expects %d args, stack arg%d is not initialized\n", + nargs, spi + 1); + return -EFAULT; + } + } + + return 0; +} + +static struct bpf_reg_state *get_func_arg_reg(struct bpf_func_state *caller, + struct bpf_reg_state *regs, int arg) +{ + if (arg < MAX_BPF_FUNC_REG_ARGS) + return ®s[arg + 1]; + + return &caller->stack_arg_regs[arg - MAX_BPF_FUNC_REG_ARGS]; +} + +static int check_map_access_type(struct bpf_verifier_env 
*env, struct bpf_reg_state *reg, int off, int size, enum bpf_access_type type) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_map *map = reg->map_ptr; u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n", - map->value_size, reg->smin_value + off, size); + map->value_size, reg_smin(reg) + off, size); return -EACCES; } if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n", - map->value_size, reg->smin_value + off, size); + map->value_size, reg_smin(reg) + off, size); return -EACCES; } @@ -4408,17 +4155,15 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, } /* check read/write into memory region (e.g., map value, ringbuf sample, etc) */ -static int __check_mem_access(struct bpf_verifier_env *env, int regno, +static int __check_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, u32 mem_size, bool zero_size_allowed) { bool size_ok = size > 0 || (size == 0 && zero_size_allowed); - struct bpf_reg_state *reg; if (off >= 0 && size_ok && (u64)off + size <= mem_size) return 0; - reg = &cur_regs(env)[regno]; switch (reg->type) { case PTR_TO_MAP_KEY: verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", @@ -4431,8 +4176,8 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: - verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", - off, size, regno, reg->id, off, mem_size); + verbose(env, "invalid access to packet, off=%d size=%d, %s(id=%d,off=%d,r=%d)\n", + off, size, reg_arg_name(env, argno), reg->id, off, mem_size); break; case PTR_TO_CTX: verbose(env, "invalid access to context, ctx_size=%d off=%d size=%d\n", @@ -4448,13 +4193,10 @@ static int 
__check_mem_access(struct bpf_verifier_env *env, int regno, } /* check read/write into a memory region with possible variable offset */ -static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, +static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, u32 mem_size, bool zero_size_allowed) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg = &state->regs[regno]; int err; /* We may have adjusted the register pointing to memory region, so we @@ -4467,36 +4209,36 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->smin_value < 0 && - (reg->smin_value == S64_MIN || - (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || - reg->smin_value + off < 0)) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); + if (reg_smin(reg) < 0 && + (reg_smin(reg) == S64_MIN || + (off + reg_smin(reg) != (s64)(s32)(off + reg_smin(reg))) || + reg_smin(reg) + off < 0)) { + verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, regno, reg->smin_value + off, size, + err = __check_mem_access(env, reg, argno, reg_smin(reg) + off, size, mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d min value is outside of the allowed memory range\n", - regno); + verbose(env, "%s min value is outside of the allowed memory range\n", + reg_arg_name(env, argno)); return err; } /* If we haven't set a max value then we need to bail since we can't be * sure we won't do bad things. - * If reg->umax_value + off could overflow, treat that as unbounded too. 
+ * If reg_umax(reg) + off could overflow, treat that as unbounded too. */ - if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n", - regno); + if (reg_umax(reg) >= BPF_MAX_VAR_OFF) { + verbose(env, "%s unbounded memory access, make sure to bounds check any such access\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_mem_access(env, regno, reg->umax_value + off, size, + err = __check_mem_access(env, reg, argno, reg_umax(reg) + off, size, mem_size, zero_size_allowed); if (err) { - verbose(env, "R%d max value is outside of the allowed memory range\n", - regno); + verbose(env, "%s max value is outside of the allowed memory range\n", + reg_arg_name(env, argno)); return err; } @@ -4504,7 +4246,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, } static int __check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, + const struct bpf_reg_state *reg, argno_t argno, bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper @@ -4520,15 +4262,15 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, return -EACCES; } - if (reg->smin_value < 0) { - verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n", - reg_type_str(env, reg->type), regno, reg->var_off.value); + if (reg_smin(reg) < 0) { + verbose(env, "negative offset %s ptr %s off=%lld disallowed\n", + reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; } if (!fixed_off_ok && reg->var_off.value != 0) { - verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n", - reg_type_str(env, reg->type), regno, reg->var_off.value); + verbose(env, "dereference of modified %s ptr %s off=%lld disallowed\n", + reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value); return -EACCES; } @@ -4538,7 +4280,7 @@ static int __check_ptr_off_reg(struct 
bpf_verifier_env *env, static int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno) { - return __check_ptr_off_reg(env, reg, regno, false); + return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } static int map_kptr_match_type(struct bpf_verifier_env *env, @@ -4574,9 +4316,9 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. * Since ref_ptr cannot be accessed directly by BPF insns, check for - * reg->ref_obj_id is not needed here. + * reg->id is not needed here. */ - if (__check_ptr_off_reg(env, reg, regno, true)) + if (__check_ptr_off_reg(env, reg, argno_from_reg(regno), true)) return -EACCES; /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and @@ -4719,7 +4461,7 @@ static int mark_uptr_ld_reg(struct bpf_verifier_env *env, u32 regno, return 0; } -static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, +static int check_map_kptr_access(struct bpf_verifier_env *env, int value_regno, int insn_idx, struct btf_field *kptr_field) { @@ -4796,19 +4538,16 @@ static u32 map_mem_size(const struct bpf_map *map) } /* check read/write into a map element with possible variable offset */ -static int check_map_access(struct bpf_verifier_env *env, u32 regno, +static int check_map_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, bool zero_size_allowed, enum bpf_access_src src) { - struct bpf_verifier_state *vstate = env->cur_state; - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; u32 mem_size = map_mem_size(map); struct btf_record *rec; int err, i; - err = check_mem_region_access(env, regno, off, size, mem_size, zero_size_allowed); + err = check_mem_region_access(env, reg, argno, off, 
size, mem_size, zero_size_allowed); if (err) return err; @@ -4823,8 +4562,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * this program. To check that [x1, x2) overlaps with [y1, y2), * it is sufficient to check x1 < y2 && y1 < x2. */ - if (reg->smin_value + off < p + field->size && - p < reg->umax_value + off + size) { + if (reg_smin(reg) + off < p + field->size && + p < reg_umax(reg) + off + size) { switch (field->type) { case BPF_KPTR_UNREF: case BPF_KPTR_REF: @@ -4904,30 +4643,29 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } } -static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, +static int check_packet_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int size, bool zero_size_allowed) { - struct bpf_reg_state *reg = reg_state(env, regno); int err; if (reg->range < 0) { - verbose(env, "R%d offset is outside of the packet\n", regno); + verbose(env, "%s offset is outside of the packet\n", reg_arg_name(env, argno)); return -EINVAL; } - err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed); + err = check_mem_region_access(env, reg, argno, off, size, reg->range, zero_size_allowed); if (err) return err; /* __check_mem_access has made sure "off + size - 1" is within u16. - * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, + * reg_umax(reg) can't be bigger than MAX_PACKET_OFF which is 0xffff, * otherwise find_good_pkt_pointers would have refused to set range info * that __check_mem_access would have rejected this pkt access. - * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. + * Therefore, "off + reg_umax(reg) + size - 1" won't overflow u32. 
*/ env->prog->aux->max_pkt_offset = max_t(u32, env->prog->aux->max_pkt_offset, - off + reg->umax_value + size - 1); + off + reg_umax(reg) + size - 1); return 0; } @@ -4951,8 +4689,8 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of * type of narrower access. */ if (base_type(info->reg_type) == PTR_TO_BTF_ID) { - if (info->ref_obj_id && - !find_reference_state(env->cur_state, info->ref_obj_id)) { + if (info->ref_id && + !find_reference_state(env->cur_state, info->ref_id)) { verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n", off); return -EACCES; @@ -4970,7 +4708,7 @@ static int __check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int of return -EACCES; } -static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno, int off, int access_size, enum bpf_access_type t, struct bpf_insn_access_aux *info) { @@ -4980,17 +4718,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn */ bool var_off_ok = is_var_ctx_off_allowed(env->prog); bool fixed_off_ok = !env->ops->convert_ctx_access; - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; int err; if (var_off_ok) - err = check_mem_region_access(env, regno, off, access_size, U16_MAX, false); + err = check_mem_region_access(env, reg, argno, off, access_size, U16_MAX, false); else - err = __check_ptr_off_reg(env, reg, regno, fixed_off_ok); + err = __check_ptr_off_reg(env, reg, argno, fixed_off_ok); if (err) return err; - off += reg->umax_value; + off += reg_umax(reg); err = __check_ctx_access(env, insn_idx, off, access_size, t, info); if (err) @@ -4998,9 +4734,21 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } -static int check_flow_keys_access(struct bpf_verifier_env *env, int off, - int size) 
+static int check_flow_keys_access(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, argno_t argno, + int off, int size) { + /* Only a constant offset is allowed here; fold it into off. */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "%s invalid variable offset to flow keys: off=%d, var_off=%s\n", + reg_arg_name(env, argno), off, tn_buf); + return -EACCES; + } + off += reg->var_off.value; + if (size < 0 || off < 0 || (u64)off + size > sizeof(struct bpf_flow_keys)) { verbose(env, "invalid access to flow keys off=%d size=%d\n", @@ -5011,16 +4759,15 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, } static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, - u32 regno, int off, int size, + struct bpf_reg_state *reg, argno_t argno, int off, int size, enum bpf_access_type t) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_insn_access_aux info = {}; bool valid; - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); + if (reg_smin(reg) < 0) { + verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n", + reg_arg_name(env, argno)); return -EACCES; } @@ -5048,8 +4795,8 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return 0; } - verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str(env, reg->type), off, size); + verbose(env, "%s invalid %s access off=%d size=%d\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -5124,10 +4871,10 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { [CONST_PTR_TO_MAP] = btf_bpf_map_id, }; -static bool is_trusted_reg(const struct bpf_reg_state *reg) +static bool is_trusted_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg) { /* A referenced register is always 
trusted. */ - if (reg->ref_obj_id) + if (reg_is_referenced(env, reg)) return true; /* Types listed in the reg2btf_ids are always trusted */ @@ -5369,7 +5116,10 @@ process_func: } subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth); - if (priv_stack_supported) { + if (IS_ENABLED(CONFIG_X86_64) && subprog[idx].stack_arg_cnt) { + /* x86-64 uses R9 for both private stack frame pointer and arg6. */ + subprog[idx].priv_stack_mode = NO_PRIV_STACK; + } else if (priv_stack_supported) { /* Request private stack support only if the subprog stack * depth is no less than BPF_PRIV_STACK_MIN_SIZE. This is to * avoid jit penalty if the stack usage is small. @@ -5380,6 +5130,8 @@ process_func: } if (subprog[idx].priv_stack_mode == PRIV_STACK_ADAPTIVE) { + if (subprog_depth > env->max_stack_depth) + env->max_stack_depth = subprog_depth; if (subprog_depth > MAX_BPF_STACK) { verbose(env, "stack size of subprog %d is %d. Too large\n", idx, subprog_depth); @@ -5387,6 +5139,8 @@ process_func: } } else { depth += subprog_depth; + if (depth > env->max_stack_depth) + env->max_stack_depth = depth; if (depth > MAX_BPF_STACK) { total = 0; for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) @@ -5405,7 +5159,7 @@ continue_func: if (bpf_pseudo_kfunc_call(insn + i) && !insn[i].off) { bool err = false; - if (!is_bpf_throw_kfunc(insn + i)) + if (!bpf_is_throw_kfunc(insn + i)) continue; for (tmp = idx; tmp >= 0 && !err; tmp = dinfo[tmp].caller) { if (subprog[tmp].is_cb) { @@ -5473,14 +5227,23 @@ continue_func: * this info will be utilized by JIT so that we will be preserving the * tail call counter throughout bpf2bpf calls combined with tailcalls */ - if (tail_call_reachable) + if (tail_call_reachable) { for (tmp = idx; tmp >= 0; tmp = dinfo[tmp].caller) { if (subprog[tmp].is_exception_cb) { verbose(env, "cannot tail call within exception cb\n"); return -EINVAL; } + if (subprog[tmp].stack_arg_cnt) { + verbose(env, "tail_calls are not allowed in programs with stack args\n"); + return 
-EINVAL; + } subprog[tmp].tail_call_reachable = true; } + } else if (!idx && subprog[0].has_tail_call && subprog[0].stack_arg_cnt) { + verbose(env, "tail_calls are not allowed in programs with stack args\n"); + return -EINVAL; + } + if (subprog[0].tail_call_reachable) env->prog->aux->tail_call_reachable = true; @@ -5499,6 +5262,9 @@ continue_func: frame = dinfo[idx].frame; i = dinfo[idx].ret_insn; + /* reset tail_call_reachable to the parent's actual state */ + tail_call_reachable = subprog[idx].tail_call_reachable; + goto continue_func; } @@ -5559,12 +5325,12 @@ static int check_max_stack_depth(struct bpf_verifier_env *env) static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, - int regno, int off, int size) + argno_t argno, int off, int size) { if (off < 0) { verbose(env, - "R%d invalid %s buffer access: off=%d, size=%d\n", - regno, buf_info, off, size); + "%s invalid %s buffer access: off=%d, size=%d\n", + reg_arg_name(env, argno), buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off)) { @@ -5572,8 +5338,8 @@ static int __check_buffer_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s\n", - regno, off, tn_buf); + "%s invalid variable buffer offset: off=%d, var_off=%s\n", + reg_arg_name(env, argno), off, tn_buf); return -EACCES; } @@ -5582,11 +5348,11 @@ static int __check_buffer_access(struct bpf_verifier_env *env, static int check_tp_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, - int regno, int off, int size) + argno_t argno, int off, int size) { int err; - err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); + err = __check_buffer_access(env, "tracepoint", reg, argno, off, size); if (err) return err; @@ -5598,14 +5364,14 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, static int check_buffer_access(struct 
bpf_verifier_env *env, const struct bpf_reg_state *reg, - int regno, int off, int size, + argno_t argno, int off, int size, bool zero_size_allowed, u32 *max_access) { const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr"; int err; - err = __check_buffer_access(env, buf_info, reg, regno, off, size); + err = __check_buffer_access(env, buf_info, reg, argno, off, size); if (err) return err; @@ -5618,7 +5384,7 @@ static int check_buffer_access(struct bpf_verifier_env *env, static void zext_32_to_64(struct bpf_reg_state *reg) { reg->var_off = tnum_subreg(reg->var_off); - __reg_assign_32_into_64(reg); + reg_set_urange64(reg, reg_u32_min(reg), reg_u32_max(reg)); } /* truncate register to smaller size (in bytes) @@ -5633,15 +5399,10 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) /* fix arithmetic bounds */ mask = ((u64)1 << (size * 8)) - 1; - if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { - reg->umin_value &= mask; - reg->umax_value &= mask; - } else { - reg->umin_value = 0; - reg->umax_value = mask; - } - reg->smin_value = reg->umin_value; - reg->smax_value = reg->umax_value; + if ((reg_umin(reg) & ~mask) == (reg_umax(reg) & ~mask)) + reg_set_urange64(reg, reg_umin(reg) & mask, reg_umax(reg) & mask); + else + reg_set_urange64(reg, 0, mask); /* If size is smaller than 32bit register the 32bit register * values are also truncated so we push 64-bit bounds into @@ -5656,19 +5417,16 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) static void set_sext64_default_val(struct bpf_reg_state *reg, int size) { if (size == 1) { - reg->smin_value = reg->s32_min_value = S8_MIN; - reg->smax_value = reg->s32_max_value = S8_MAX; + reg_set_srange64(reg, S8_MIN, S8_MAX); + reg_set_srange32(reg, S8_MIN, S8_MAX); } else if (size == 2) { - reg->smin_value = reg->s32_min_value = S16_MIN; - reg->smax_value = reg->s32_max_value = S16_MAX; + reg_set_srange64(reg, S16_MIN, S16_MAX); + reg_set_srange32(reg, S16_MIN, 
S16_MAX); } else { /* size == 4 */ - reg->smin_value = reg->s32_min_value = S32_MIN; - reg->smax_value = reg->s32_max_value = S32_MAX; + reg_set_srange64(reg, S32_MIN, S32_MAX); + reg_set_srange32(reg, S32_MIN, S32_MAX); } - reg->umin_value = reg->u32_min_value = 0; - reg->umax_value = U64_MAX; - reg->u32_max_value = U32_MAX; reg->var_off = tnum_unknown; } @@ -5689,29 +5447,27 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s32)u64_cval); u64_cval = reg->var_off.value; - reg->smax_value = reg->smin_value = u64_cval; - reg->umax_value = reg->umin_value = u64_cval; - reg->s32_max_value = reg->s32_min_value = u64_cval; - reg->u32_max_value = reg->u32_min_value = u64_cval; + reg->r64 = cnum64_from_urange(u64_cval, u64_cval); + reg->r32 = cnum32_from_urange((u32)u64_cval, (u32)u64_cval); return; } - top_smax_value = ((u64)reg->smax_value >> num_bits) << num_bits; - top_smin_value = ((u64)reg->smin_value >> num_bits) << num_bits; + top_smax_value = ((u64)reg_smax(reg) >> num_bits) << num_bits; + top_smin_value = ((u64)reg_smin(reg) >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s64_min and s64_min after sign extension */ if (size == 1) { - init_s64_max = (s8)reg->smax_value; - init_s64_min = (s8)reg->smin_value; + init_s64_max = (s8)reg_smax(reg); + init_s64_min = (s8)reg_smin(reg); } else if (size == 2) { - init_s64_max = (s16)reg->smax_value; - init_s64_min = (s16)reg->smin_value; + init_s64_max = (s16)reg_smax(reg); + init_s64_min = (s16)reg_smin(reg); } else { - init_s64_max = (s32)reg->smax_value; - init_s64_min = (s32)reg->smin_value; + init_s64_max = (s32)reg_smax(reg); + init_s64_min = (s32)reg_smin(reg); } s64_max = max(init_s64_max, init_s64_min); @@ -5719,10 +5475,8 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->s32_min_value = reg->smin_value = 
s64_min; - reg->s32_max_value = reg->smax_value = s64_max; - reg->u32_min_value = reg->umin_value = s64_min; - reg->u32_max_value = reg->umax_value = s64_max; + reg_set_srange64(reg, s64_min, s64_max); + reg_set_srange32(reg, s64_min, s64_max); reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -5733,16 +5487,11 @@ out: static void set_sext32_default_val(struct bpf_reg_state *reg, int size) { - if (size == 1) { - reg->s32_min_value = S8_MIN; - reg->s32_max_value = S8_MAX; - } else { + if (size == 1) + reg_set_srange32(reg, S8_MIN, S8_MAX); + else /* size == 2 */ - reg->s32_min_value = S16_MIN; - reg->s32_max_value = S16_MAX; - } - reg->u32_min_value = 0; - reg->u32_max_value = U32_MAX; + reg_set_srange32(reg, S16_MIN, S16_MAX); reg->var_off = tnum_subreg(tnum_unknown); } @@ -5760,34 +5509,30 @@ static void coerce_subreg_to_size_sx(struct bpf_reg_state *reg, int size) reg->var_off = tnum_const((s16)u32_val); u32_val = reg->var_off.value; - reg->s32_min_value = reg->s32_max_value = u32_val; - reg->u32_min_value = reg->u32_max_value = u32_val; + reg_set_srange32(reg, u32_val, u32_val); return; } - top_smax_value = ((u32)reg->s32_max_value >> num_bits) << num_bits; - top_smin_value = ((u32)reg->s32_min_value >> num_bits) << num_bits; + top_smax_value = ((u32)reg_s32_max(reg) >> num_bits) << num_bits; + top_smin_value = ((u32)reg_s32_min(reg) >> num_bits) << num_bits; if (top_smax_value != top_smin_value) goto out; /* find the s32_min and s32_min after sign extension */ if (size == 1) { - init_s32_max = (s8)reg->s32_max_value; - init_s32_min = (s8)reg->s32_min_value; + init_s32_max = (s8)reg_s32_max(reg); + init_s32_min = (s8)reg_s32_min(reg); } else { /* size == 2 */ - init_s32_max = (s16)reg->s32_max_value; - init_s32_min = (s16)reg->s32_min_value; + init_s32_max = (s16)reg_s32_max(reg); + init_s32_min = (s16)reg_s32_min(reg); } s32_max = max(init_s32_max, init_s32_min); s32_min = min(init_s32_max, init_s32_min); if ((s32_min >= 0) == (s32_max >= 0)) { - 
reg->s32_min_value = s32_min; - reg->s32_max_value = s32_max; - reg->u32_min_value = (u32)s32_min; - reg->u32_max_value = (u32)s32_max; + reg_set_srange32(reg, s32_min, s32_max); reg->var_off = tnum_subreg(tnum_range(s32_min, s32_max)); return; } @@ -5977,12 +5722,11 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, - int regno, int off, int size, + struct bpf_reg_state *regs, struct bpf_reg_state *reg, + argno_t argno, int off, int size, enum bpf_access_type atype, int value_regno) { - struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); const char *field_name = NULL; @@ -6008,8 +5752,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d is ptr_%s invalid variable offset: off=%d, var_off=%s\n", - regno, tname, off, tn_buf); + "%s is ptr_%s invalid variable offset: off=%d, var_off=%s\n", + reg_arg_name(env, argno), tname, off, tn_buf); return -EACCES; } @@ -6017,22 +5761,22 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (off < 0) { verbose(env, - "R%d is ptr_%s invalid negative access: off=%d\n", - regno, tname, off); + "%s is ptr_%s invalid negative access: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } if (reg->type & MEM_USER) { verbose(env, - "R%d is ptr_%s access user memory: off=%d\n", - regno, tname, off); + "%s is ptr_%s access user memory: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } if (reg->type & MEM_PERCPU) { verbose(env, - "R%d is ptr_%s access percpu memory: off=%d\n", - regno, tname, off); + "%s is ptr_%s access percpu memory: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } @@ -6044,7 +5788,7 @@ static int check_ptr_to_btf_access(struct 
bpf_verifier_env *env, ret = env->ops->btf_struct_access(&env->log, reg, off, size); } else { /* Writes are permitted with default btf_struct_access for - * program allocated objects (which always have ref_obj_id > 0), + * program allocated objects (which always have id > 0), * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. */ if (atype != BPF_READ && !type_is_ptr_alloc_obj(reg->type)) { @@ -6053,8 +5797,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && - !(reg->type & MEM_RCU) && !reg->ref_obj_id) { - verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); + !(reg->type & MEM_RCU) && !reg_is_referenced(env, reg)) { + verifier_bug(env, "allocated object must have a referenced id"); return -EFAULT; } @@ -6073,7 +5817,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, */ flag = PTR_UNTRUSTED; - } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) { + } else if (is_trusted_reg(env, reg) || is_rcu_reg(reg)) { /* By default any pointer obtained from walking a trusted pointer is no * longer trusted, unless the field being accessed has explicitly been * marked as inheriting its parent's state of trust (either full or RCU). 
@@ -6134,12 +5878,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } static int check_ptr_to_map_access(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, - int regno, int off, int size, + struct bpf_reg_state *regs, struct bpf_reg_state *reg, + argno_t argno, int off, int size, enum bpf_access_type atype, int value_regno) { - struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; @@ -6170,8 +5913,8 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, } if (off < 0) { - verbose(env, "R%d is %s invalid negative access: off=%d\n", - regno, tname, off); + verbose(env, "%s is %s invalid negative access: off=%d\n", + reg_arg_name(env, argno), tname, off); return -EACCES; } @@ -6228,11 +5971,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, * 'off' includes `regno->offset`, but not its dynamic part (if any). */ static int check_stack_access_within_bounds( - struct bpf_verifier_env *env, - int regno, int off, int access_size, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t argno, int off, int access_size, enum bpf_access_type type) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); s64 min_off, max_off; int err; @@ -6247,14 +5989,14 @@ static int check_stack_access_within_bounds( min_off = (s64)reg->var_off.value + off; max_off = min_off + access_size; } else { - if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smin_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "invalid unbounded variable-offset%s stack R%d\n", - err_extra, regno); + if (reg_smax(reg) >= BPF_MAX_VAR_OFF || + reg_smin(reg) <= -BPF_MAX_VAR_OFF) { + verbose(env, "invalid unbounded variable-offset%s stack %s\n", + err_extra, reg_arg_name(env, argno)); return -EACCES; } - min_off = reg->smin_value + off; - max_off = reg->smax_value + off + access_size; + min_off = reg_smin(reg) + off; 
+ max_off = reg_smax(reg) + off + access_size; } err = check_stack_slot_within_bounds(env, min_off, state, type); @@ -6268,14 +6010,14 @@ static int check_stack_access_within_bounds( if (err) { if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid%s stack R%d off=%lld size=%d\n", - err_extra, regno, min_off, access_size); + verbose(env, "invalid%s stack %s off=%lld size=%d\n", + err_extra, reg_arg_name(env, argno), min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable-offset%s stack R%d var_off=%s off=%d size=%d\n", - err_extra, regno, tn_buf, off, access_size); + verbose(env, "invalid variable-offset%s stack %s var_off=%s off=%d size=%d\n", + err_extra, reg_arg_name(env, argno), tn_buf, off, access_size); } return err; } @@ -6320,12 +6062,11 @@ static void add_scalar_to_reg(struct bpf_reg_state *dst_reg, s64 val) * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *reg, argno_t argno, int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once, bool is_ldsx) { struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -6338,11 +6079,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) { - verbose(env, "write to change key R%d not allowed\n", regno); + verbose(env, "write to change key %s not allowed\n", + reg_arg_name(env, argno)); return -EACCES; } - err = check_mem_region_access(env, regno, off, size, + err = check_mem_region_access(env, reg, argno, off, size, reg->map_ptr->key_size, false); if (err) return 
err; @@ -6356,17 +6098,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - err = check_map_access_type(env, regno, off, size, t); + err = check_map_access_type(env, reg, off, size, t); if (err) return err; - err = check_map_access(env, regno, off, size, false, ACCESS_DIRECT); + err = check_map_access(env, reg, argno, off, size, false, ACCESS_DIRECT); if (err) return err; if (tnum_is_const(reg->var_off)) kptr_field = btf_record_find(reg->map_ptr->record, off + reg->var_off.value, BPF_KPTR | BPF_UPTR); if (kptr_field) { - err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); + err = check_map_kptr_access(env, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; @@ -6394,7 +6136,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn size); return -EACCES; } - copy_register_state(&regs[value_regno], reg); + regs[value_regno] = *reg; add_scalar_to_reg(&regs[value_regno], off); regs[value_regno].type = PTR_TO_INSN; } else { @@ -6406,14 +6148,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED); if (type_may_be_null(reg->type)) { - verbose(env, "R%d invalid mem access '%s'\n", regno, + verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } if (t == BPF_WRITE && rdonly_mem) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6428,7 +6170,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * instructions, hence no need to check bounds in that case.
*/ if (!rdonly_untrusted) - err = check_mem_region_access(env, regno, off, size, + err = check_mem_region_access(env, reg, argno, off, size, reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); @@ -6446,7 +6188,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_access(env, insn_idx, regno, off, size, t, &info); + err = check_ctx_access(env, insn_idx, reg, argno, off, size, t, &info); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a * PTR_TO_PACKET[_META,_END]. In the latter @@ -6464,8 +6206,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (type_may_be_null(info.reg_type)) - regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the * insn. When the dst is PTR, it is for sure not @@ -6475,23 +6215,25 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (base_type(info.reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = info.btf; regs[value_regno].btf_id = info.btf_id; - regs[value_regno].ref_obj_id = info.ref_obj_id; + regs[value_regno].id = info.ref_id; } + if (type_may_be_null(info.reg_type) && !regs[value_regno].id) + regs[value_regno].id = ++env->id_gen; } regs[value_regno].type = info.reg_type; } } else if (reg->type == PTR_TO_STACK) { /* Basic bounds checks. 
*/ - err = check_stack_access_within_bounds(env, regno, off, size, t); + err = check_stack_access_within_bounds(env, reg, argno, off, size, t); if (err) return err; if (t == BPF_READ) - err = check_stack_read(env, regno, off, size, + err = check_stack_read(env, reg, argno, off, size, value_regno); else - err = check_stack_write(env, regno, off, size, + err = check_stack_write(env, reg, off, size, value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { @@ -6504,7 +6246,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); return -EACCES; } - err = check_packet_access(env, regno, off, size, false); + err = check_packet_access(env, reg, argno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_FLOW_KEYS) { @@ -6515,28 +6257,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_flow_keys_access(env, off, size); + err = check_flow_keys_access(env, reg, argno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } - err = check_sock_access(env, insn_idx, regno, off, size, t); + err = check_sock_access(env, insn_idx, reg, argno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_TP_BUFFER) { - err = check_tp_buffer_access(env, reg, regno, off, size); + err = check_tp_buffer_access(env, reg, argno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) 
== PTR_TO_BTF_ID && !type_may_be_null(reg->type)) { - err = check_ptr_to_btf_access(env, regs, regno, off, size, t, + err = check_ptr_to_btf_access(env, regs, reg, argno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { - err = check_ptr_to_map_access(env, regs, regno, off, size, t, + err = check_ptr_to_map_access(env, regs, reg, argno, off, size, t, value_regno); } else if (base_type(reg->type) == PTR_TO_BUF && !type_may_be_null(reg->type)) { @@ -6545,8 +6287,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (rdonly_mem) { if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } max_access = &env->prog->aux->max_rdonly_access; @@ -6554,7 +6296,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn max_access = &env->prog->aux->max_rdwr_access; } - err = check_buffer_access(env, reg, regno, off, size, false, + err = check_buffer_access(env, reg, argno, off, size, false, max_access); if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) @@ -6563,7 +6305,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { - verbose(env, "R%d invalid mem access '%s'\n", regno, + verbose(env, "%s invalid mem access '%s'\n", reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -6586,10 +6328,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once, bool is_ldsx, bool allow_trust_mismatch, const char *ctx) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type src_reg_type; int err; + /* 
Handle stack arg read */ + if (is_stack_arg_ldx(insn)) { + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + return check_stack_arg_read(env, state, insn->off, insn->dst_reg); + } + /* check src operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -6605,7 +6357,7 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check if (src_reg + off) is readable. The state of dst_reg will be * updated by this call. */ - err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->src_reg, argno_from_reg(insn->src_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, strict_alignment_once, is_ldsx); err = err ?: save_aux_ptr_type(env, src_reg_type, @@ -6618,10 +6370,20 @@ static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, bool strict_alignment_once) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = cur_regs(env); enum bpf_reg_type dst_reg_type; int err; + /* Handle stack arg write */ + if (is_stack_arg_stx(insn)) { + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + return check_stack_arg_write(env, state, insn->off, regs + insn->src_reg); + } + /* check src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) @@ -6635,7 +6397,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, dst_reg_type = regs[insn->dst_reg].type; /* Check if (dst_reg + off) is writeable. 
*/ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, regs + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, strict_alignment_once, false); err = err ?: save_aux_ptr_type(env, dst_reg_type, false); @@ -6646,6 +6408,7 @@ static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, static int check_atomic_rmw(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_reg_state *dst_reg; int load_reg; int err; @@ -6707,13 +6470,15 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, load_reg = -1; } + dst_reg = cur_regs(env) + insn->dst_reg; + /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, env->insn_idx, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_READ, load_reg, true, false); if (err) @@ -6725,7 +6490,7 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, return err; } /* Check whether we can write into the same memory. */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; @@ -6814,11 +6579,10 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) * read offsets are marked as read. 
*/ static int check_stack_range_initialized( - struct bpf_verifier_env *env, int regno, int off, + struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int off, int access_size, bool zero_size_allowed, enum bpf_access_type type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = bpf_func(env, reg); int err, min_off, max_off, i, j, slot, spi; /* Some accesses can write anything into the stack, others are @@ -6840,7 +6604,7 @@ static int check_stack_range_initialized( return -EACCES; } - err = check_stack_access_within_bounds(env, regno, off, access_size, type); + err = check_stack_access_within_bounds(env, reg, argno, off, access_size, type); if (err) return err; @@ -6857,8 +6621,8 @@ static int check_stack_range_initialized( char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "%s variable offset stack access prohibited for !root, var_off=%s\n", + reg_arg_name(env, argno), tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -6870,8 +6634,8 @@ static int check_stack_range_initialized( if (meta && meta->raw_mode) meta = NULL; - min_off = reg->smin_value + off; - max_off = reg->smax_value + off; + min_off = reg_smin(reg) + off; + max_off = reg_smax(reg) + off; } if (meta && meta->raw_mode) { @@ -6901,7 +6665,7 @@ static int check_stack_range_initialized( } } meta->access_size = access_size; - meta->regno = regno; + meta->regno = reg_from_argno(argno); return 0; } @@ -6941,17 +6705,17 @@ static int check_stack_range_initialized( if (*stype == STACK_POISON) { if (allow_poison) goto mark; - verbose(env, "reading from stack R%d off %d+%d size %d, slot poisoned by dead code elimination\n", - regno, min_off, i - min_off, access_size); + verbose(env, "reading from stack %s off %d+%d size %d, slot poisoned by dead code 
elimination\n", + reg_arg_name(env, argno), min_off, i - min_off, access_size); } else if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid read from stack R%d off %d+%d size %d\n", - regno, min_off, i - min_off, access_size); + verbose(env, "invalid read from stack %s off %d+%d size %d\n", + reg_arg_name(env, argno), min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n", - regno, tn_buf, i - min_off, access_size); + verbose(env, "invalid read from stack %s var_off %s+%d size %d\n", + reg_arg_name(env, argno), tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -6960,48 +6724,48 @@ mark: return 0; } -static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, +static int check_helper_mem_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int access_size, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_reg_state *regs = cur_regs(env); u32 *max_access; switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, 0, access_size, + return check_packet_access(env, reg, argno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, regno, 0, access_size, + return check_mem_region_access(env, reg, argno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, 0, access_size, access_type)) + if (check_map_access_type(env, reg, 0, access_size, access_type)) return -EACCES; - return
check_map_access(env, regno, 0, access_size, + return check_map_access(env, reg, argno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } } - return check_mem_region_access(env, regno, 0, + return check_mem_region_access(env, reg, argno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { if (access_type == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", regno, - reg_type_str(env, reg->type)); + verbose(env, "%s cannot write into %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } @@ -7009,26 +6773,26 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } else { max_access = &env->prog->aux->max_rdwr_access; } - return check_buffer_access(env, reg, regno, 0, + return check_buffer_access(env, reg, argno, 0, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: return check_stack_range_initialized( - env, - regno, 0, access_size, + env, reg, + argno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, regno, 0, - access_size, BPF_READ, -1); + return check_ptr_to_btf_access(env, regs, reg, argno, 0, + access_size, access_type, -1); case PTR_TO_CTX: /* Only permit reading or writing syscall context using helper calls. 
*/ if (is_var_ctx_off_allowed(env->prog)) { - int err = check_mem_region_access(env, regno, 0, access_size, U16_MAX, + int err = check_mem_region_access(env, reg, argno, 0, access_size, U16_MAX, zero_size_allowed); if (err) return err; - if (env->prog->aux->max_ctx_offset < reg->umax_value + access_size) - env->prog->aux->max_ctx_offset = reg->umax_value + access_size; + if (env->prog->aux->max_ctx_offset < reg_umax(reg) + access_size) + env->prog->aux->max_ctx_offset = reg_umax(reg) + access_size; return 0; } fallthrough; @@ -7038,7 +6802,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, bpf_register_is_null(reg)) return 0; - verbose(env, "R%d type=%s ", regno, + verbose(env, "%s type=%s ", reg_arg_name(env, argno), reg_type_str(env, reg->type)); verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; @@ -7048,12 +6812,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, /* verify arguments to helpers or kfuncs consisting of a pointer and an access * size. * - * @regno is the register containing the access size. regno-1 is the register - * containing the pointer. + * @mem_reg contains the pointer, @size_reg contains the access size. */ static int check_mem_size_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, - enum bpf_access_type access_type, + struct bpf_reg_state *mem_reg, + struct bpf_reg_state *size_reg, argno_t mem_argno, + argno_t size_argno, enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7067,42 +6831,48 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - meta->msize_max_value = reg->umax_value; + meta->msize_max_value = reg_umax(size_reg); /* The register is SCALAR_VALUE; the access check happens using * its boundaries. 
For unprivileged variable accesses, disable * raw mode so that the program is required to initialize all * the memory that the helper could just partially fill up. */ - if (!tnum_is_const(reg->var_off)) + if (!tnum_is_const(size_reg->var_off)) meta = NULL; - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); + if (reg_smin(size_reg) < 0) { + verbose(env, "%s min value is negative, either use unsigned or 'var &= const'\n", + reg_arg_name(env, size_argno)); return -EACCES; } - if (reg->umin_value == 0 && !zero_size_allowed) { - verbose(env, "R%d invalid zero-sized read: u64=[%lld,%lld]\n", - regno, reg->umin_value, reg->umax_value); + if (reg_umin(size_reg) == 0 && !zero_size_allowed) { + verbose(env, "%s invalid zero-sized read: u64=[%lld,%lld]\n", + reg_arg_name(env, size_argno), reg_umin(size_reg), reg_umax(size_reg)); return -EACCES; } - if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); + if (reg_umax(size_reg) >= BPF_MAX_VAR_SIZ) { + verbose(env, "%s unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + reg_arg_name(env, size_argno)); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, reg->umax_value, + err = check_helper_mem_access(env, mem_reg, mem_argno, reg_umax(size_reg), access_type, zero_size_allowed, meta); - if (!err) - err = mark_chain_precision(env, regno); + if (!err) { + int regno = reg_from_argno(size_argno); + + if (regno >= 0) + err = mark_chain_precision(env, regno); + else + err = mark_stack_arg_precision(env, arg_idx_from_argno(size_argno)); + } return err; } static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size) + argno_t argno, u32 mem_size) { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; @@ -7111,6 +6881,12 @@ static int check_mem_reg(struct bpf_verifier_env 
*env, struct bpf_reg_state *reg if (bpf_register_is_null(reg)) return 0; + if (mem_size > S32_MAX) { + verbose(env, "%s memory size %u is too large\n", + reg_arg_name(env, argno), mem_size); + return -EACCES; + } + /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. @@ -7122,8 +6898,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg int size = base_type(reg->type) == PTR_TO_STACK ? -(int)mem_size : mem_size; - err = check_helper_mem_access(env, regno, size, BPF_READ, true, NULL); - err = err ?: check_helper_mem_access(env, regno, size, BPF_WRITE, true, NULL); + err = check_helper_mem_access(env, reg, argno, size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, reg, argno, size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7131,17 +6907,14 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg return err; } -static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno) +static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *mem_reg, + struct bpf_reg_state *size_reg, argno_t mem_argno, argno_t size_argno) { - struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; struct bpf_call_arg_meta meta; int err; - WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); - memset(&meta, 0, sizeof(meta)); if (may_be_null) { @@ -7149,8 +6922,8 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); - err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); + err = check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_READ, 
true, &meta); + err = err ?: check_mem_size_reg(env, mem_reg, size_reg, mem_argno, size_argno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; @@ -7186,11 +6959,10 @@ enum { * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) +static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int flags) { bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin"; - struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); bool is_irq = flags & PROCESS_LOCK_IRQ; @@ -7203,8 +6975,8 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) if (!is_const) { verbose(env, - "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", - regno, lock_str); + "%s doesn't have constant offset. 
%s_lock has to be at the constant offset\n", + reg_arg_name(env, argno), lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { @@ -7303,11 +7075,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) } /* Check if @regno is a pointer to a specific field in a map value */ -static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, +static int check_map_field_pointer(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, enum btf_field_type field_type, struct bpf_map_desc *map_desc) { - struct bpf_reg_state *reg = reg_state(env, regno); bool is_const = tnum_is_const(reg->var_off); struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; @@ -7316,8 +7087,8 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, if (!is_const) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, struct_name); + "%s doesn't have constant offset. %s has to be at the constant offset\n", + reg_arg_name(env, argno), struct_name); return -EINVAL; } if (!map->btf) { @@ -7357,26 +7128,26 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, return 0; } -static int process_timer_func(struct bpf_verifier_env *env, int regno, +static int process_timer_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_map_desc *map) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) { verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); return -EOPNOTSUPP; } - return check_map_field_pointer(env, regno, BPF_TIMER, map); + return check_map_field_pointer(env, reg, argno, BPF_TIMER, map); } -static int process_timer_helper(struct bpf_verifier_env *env, int regno, +static int process_timer_helper(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_call_arg_meta *meta) { - return process_timer_func(env, regno, &meta->map); + return process_timer_func(env, reg, argno, &meta->map); 
} -static int process_timer_kfunc(struct bpf_verifier_env *env, int regno, +static int process_timer_kfunc(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return process_timer_func(env, regno, &meta->map); + return process_timer_func(env, reg, argno, &meta->map); } static int process_kptr_func(struct bpf_verifier_env *env, int regno, @@ -7427,52 +7198,42 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return 0; } -/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK +/* + * Validate dynptr arguments for helper, kfunc and subprog. + * + * @dynptr is both input and output. It is populated when the argument is + * tagged with MEM_UNINIT (i.e., the dynptr argument that will be constructed) + * and consumed when the argument is expecting to be an initialized dynptr. + * @parent_id is used to track the referenced parent object (e.g., file or skb in + * qdisc program) when constructing a dynptr. + * + * There are two register types representing a bpf_dynptr, one is PTR_TO_STACK * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. * * In both cases we deal with the first 8 bytes, but need to mark the next 8 * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. * - * Mutability of bpf_dynptr is at two levels, one is at the level of struct - * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct - * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can - * mutate the view of the dynptr and also possibly destroy it. In the latter - * case, it cannot mutate the bpf_dynptr itself but it can still mutate the - * memory that dynptr points to. 
- * - * The verifier will keep track both levels of mutation (bpf_dynptr's in - * reg->type and the memory's in reg->dynptr.type), but there is no support for - * readonly dynptr view yet, hence only the first case is tracked and checked. - * - * This is consistent with how C applies the const modifier to a struct object, - * where the pointer itself inside bpf_dynptr becomes const but not what it - * points to. - * - * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument - * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + * Mutability of bpf_dynptr is at two levels: the dynptr and the memory the + * dynptr points to. At the first level, the verifier will make sure a + * CONST_PTR_TO_DYNPTR cannot be reinitialized or destroyed. The mutability of + * a dynptr's view (i.e., start and offset) is not tracked as there is not such + * use case. The second level is tracked using the upper bit of bpf_dynptr->size + * and checked dynamically during runtime. 
*/ -static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx, - enum bpf_arg_type arg_type, int clone_ref_obj_id) +static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + argno_t argno, int insn_idx, enum bpf_arg_type arg_type, + struct ref_obj_desc *ref_obj, struct bpf_dynptr_desc *dynptr) { - struct bpf_reg_state *reg = reg_state(env, regno); - int err; + int spi, err = 0; if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, - "arg#%d expected pointer to stack or const struct bpf_dynptr\n", - regno - 1); + "%s expected pointer to stack or const struct bpf_dynptr\n", + reg_arg_name(env, argno)); return -EINVAL; } - /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an - * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): - */ - if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { - verifier_bug(env, "misconfigured dynptr helper type flags"); - return -EFAULT; - } - /* MEM_UNINIT - Points to memory that is an appropriate candidate for * constructing a mutable bpf_dynptr object. * @@ -7480,13 +7241,12 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn * pointing to a region of at least 16 bytes which doesn't * contain an existing bpf_dynptr. * - * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be - * mutated or destroyed. However, the memory it points to - * may be mutated. + * OBJ_RELEASE - Points to a initialized bpf_dynptr that will be + * destroyed. * - * None - Points to a initialized dynptr that can be mutated and - * destroyed, including mutation of the memory it points - * to. + * None - Points to a initialized dynptr that cannot be + * reinitialized or destroyed. However, the view of the + * dynptr and the memory it points to may be mutated. 
*/ if (arg_type & MEM_UNINIT) { int i; @@ -7498,45 +7258,58 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { - err = check_mem_access(env, insn_idx, regno, + err = check_mem_access(env, insn_idx, reg, argno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; } - err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, clone_ref_obj_id); - } else /* MEM_RDONLY and None case from above */ { + err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx, ref_obj, dynptr); + } else /* OBJ_RELEASE and None case from above */ { /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ - if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { - verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + if (reg->type == CONST_PTR_TO_DYNPTR && (arg_type & OBJ_RELEASE)) { + verbose(env, "CONST_PTR_TO_DYNPTR cannot be released\n"); return -EINVAL; } if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, - "Expected an initialized dynptr as arg #%d\n", - regno - 1); + verbose(env, "Expected an initialized dynptr as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } - /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ - if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { + /* Fold modifiers (in this case, OBJ_RELEASE) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~OBJ_RELEASE)) { verbose(env, - "Expected a dynptr of type %s as arg #%d\n", - dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); + "Expected a dynptr of type %s as %s\n", + dynptr_type_str(arg_to_dynptr_type(arg_type)), + reg_arg_name(env, argno)); return -EINVAL; } - err = mark_dynptr_read(env, reg); - } - return err; -} + if (reg->type != CONST_PTR_TO_DYNPTR) { + struct bpf_func_state *state = bpf_func(env, reg); -static u32 
iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) -{ - struct bpf_func_state *state = bpf_func(env, reg); + spi = dynptr_get_spi(env, reg); + if (spi < 0) + return spi; - return state->stack[spi].spilled_ptr.ref_obj_id; + /* + * For CONST_PTR_TO_DYNPTR, reg is already scratched by check_reg_arg + * in check_helper_call and mark_btf_func_reg_size in check_kfunc_call. + */ + mark_stack_slots_scratched(env, spi, BPF_DYNPTR_NR_SLOTS); + + reg = &state->stack[spi].spilled_ptr; + } + + if (dynptr) { + dynptr->type = reg->dynptr.type; + dynptr->id = reg->id; + dynptr->parent_id = reg->parent_id; + } + } + return err; } static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) @@ -7568,15 +7341,17 @@ static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg_idx, return btf_param_match_suffix(meta->btf, arg, "__iter"); } -static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, +static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, int insn_idx, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_func_state *state = bpf_func(env, reg); const struct btf_type *t; + u32 arg_idx = arg_idx_from_argno(argno); int spi, err, i, nr_slots, btf_id; if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1); + verbose(env, "%s expected pointer to an iterator on stack\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -7586,9 +7361,10 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id * to any kfunc, if arg has "__iter" suffix, we need to be a bit more * conservative here. 
*/ - btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); + btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, arg_idx); if (btf_id < 0) { - verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1); + verbose(env, "expected valid iter pointer as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } t = btf_type_by_id(meta->btf, btf_id); @@ -7597,13 +7373,13 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id if (is_iter_new_kfunc(meta)) { /* bpf_iter_<type>_new() expects pointer to uninit iter state */ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { - verbose(env, "expected uninitialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno - 1); + verbose(env, "expected uninitialized iter_%s as %s\n", + iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno)); return -EINVAL; } for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { - err = check_mem_access(env, insn_idx, regno, + err = check_mem_access(env, insn_idx, reg, argno, i, BPF_DW, BPF_WRITE, -1, false, false); if (err) return err; @@ -7621,8 +7397,8 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id case 0: break; case -EINVAL: - verbose(env, "expected an initialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno - 1); + verbose(env, "expected an initialized iter_%s as %s\n", + iter_type_str(meta->btf, btf_id), reg_arg_name(env, argno)); return err; case -EPROTO: verbose(env, "expected an RCU CS when using %s\n", meta->func_name); @@ -7635,14 +7411,12 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id if (spi < 0) return spi; - err = mark_iter_read(env, reg, spi, nr_slots); - if (err) - return err; + mark_stack_slots_scratched(env, spi, nr_slots); /* remember meta->iter info for process_iter_next_call() */ meta->iter.spi = spi; meta->iter.frameno = reg->frameno; - meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); + 
update_ref_obj(&meta->ref_obj, &state->stack[spi].spilled_ptr); if (is_iter_destroy_kfunc(meta)) { err = unmark_stack_slots_iter(env, reg, nr_slots); @@ -8042,12 +7816,11 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_DYNPTR] = &dynptr_types, }; -static int check_reg_type(struct bpf_verifier_env *env, u32 regno, +static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, enum bpf_arg_type arg_type, const u32 *arg_btf_id, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; int i, j, err; @@ -8078,7 +7851,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, type &= ~DYNPTR_TYPE_FLAG_MASK; /* Local kptr types are allowed as the source argument of bpf_kptr_xchg */ - if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) { + if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && reg_from_argno(argno) == BPF_REG_2) { type &= ~MEM_ALLOC; type &= ~MEM_PERCPU; } @@ -8092,7 +7865,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); + verbose(env, "%s type=%s expected=", reg_arg_name(env, argno), reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); @@ -8105,9 +7878,9 @@ found: if (compatible == &mem_types) { if (!(arg_type & MEM_RDONLY)) { verbose(env, - "%s() may write into memory pointed by R%d type=%s\n", + "%s() may write into memory pointed by %s type=%s\n", func_id_name(meta->func_id), - regno, reg_type_str(env, reg->type)); + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EACCES; } return 0; @@ -8130,7 +7903,8 @@ found: if 
(type_may_be_null(reg->type) && (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) { - verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); + verbose(env, "Possibly NULL pointer passed to helper %s\n", + reg_arg_name(env, argno)); return -EACCES; } @@ -8143,25 +7917,26 @@ found: } if (meta->func_id == BPF_FUNC_kptr_xchg) { - if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + if (map_kptr_match_type(env, meta->kptr_field, reg, reg_from_argno(argno))) return -EACCES; } else { if (arg_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); - verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n", - regno); + verbose(env, "%s has non-overwritten BPF_PTR_POISON type\n", + reg_arg_name(env, argno)); return -EACCES; } - err = __check_ptr_off_reg(env, reg, regno, true); + err = __check_ptr_off_reg(env, reg, argno, true); if (err) return err; if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, btf_vmlinux, *arg_btf_id, strict_type_match)) { - verbose(env, "R%d is of type %s but %s is expected\n", - regno, btf_type_name(reg->btf, reg->btf_id), + verbose(env, "%s is of type %s but %s is expected\n", + reg_arg_name(env, argno), + btf_type_name(reg->btf, reg->btf_id), btf_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } @@ -8178,8 +7953,11 @@ found: return -EFAULT; } /* Check if local kptr in src arg matches kptr in dst arg */ - if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) { - if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) + if (meta->func_id == BPF_FUNC_kptr_xchg) { + int regno = reg_from_argno(argno); + + if (regno == BPF_REG_2 && + map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } break; @@ -8213,7 +7991,7 @@ reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) } static int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, + const struct bpf_reg_state 
*reg, argno_t argno, enum bpf_arg_type arg_type) { u32 type = reg->type; @@ -8221,7 +7999,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, /* When referenced register is passed to release function, its fixed * offset must be 0. * - * We will check arg_type_is_release reg has ref_obj_id when storing + * We will check arg_type_is_release reg has id when storing * meta->release_regno. */ if (arg_type_is_release(arg_type)) { @@ -8239,8 +8017,8 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * to give the user a better error message. */ if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) { - verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", - regno); + verbose(env, "%s must have zero offset when passed to release func or trusted arg to kfunc\n", + reg_arg_name(env, argno)); return -EINVAL; } } @@ -8276,7 +8054,7 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we * still need to do checks instead of returning. 
*/ - return __check_ptr_off_reg(env, reg, regno, true); + return __check_ptr_off_reg(env, reg, argno, true); case PTR_TO_CTX: /* * Allow fixed and variable offsets for syscall context, but @@ -8288,78 +8066,12 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, return 0; fallthrough; default: - return __check_ptr_off_reg(env, reg, regno, false); + return __check_ptr_off_reg(env, reg, argno, false); } } -static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env, - const struct bpf_func_proto *fn, - struct bpf_reg_state *regs) -{ - struct bpf_reg_state *state = NULL; - int i; - - for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) - if (arg_type_is_dynptr(fn->arg_type[i])) { - if (state) { - verbose(env, "verifier internal error: multiple dynptr args\n"); - return NULL; - } - state = ®s[BPF_REG_1 + i]; - } - - if (!state) - verbose(env, "verifier internal error: no dynptr arg found\n"); - - return state; -} - -static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->id; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - return state->stack[spi].spilled_ptr.id; -} - -static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->ref_obj_id; - spi = dynptr_get_spi(env, reg); - if (spi < 0) - return spi; - return state->stack[spi].spilled_ptr.ref_obj_id; -} - -static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) -{ - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - if (reg->type == CONST_PTR_TO_DYNPTR) - return reg->dynptr.type; - - spi = bpf_get_spi(reg->var_off.value); - if (spi < 0) { - verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); - return 
BPF_DYNPTR_TYPE_INVALID; - } - - return state->stack[spi].spilled_ptr.dynptr.type; -} - -static int check_reg_const_str(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno) +static int check_arg_const_str(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, argno_t argno) { struct bpf_map *map = reg->map_ptr; int err; @@ -8371,17 +8083,18 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EINVAL; if (map->map_type == BPF_MAP_TYPE_INSN_ARRAY) { - verbose(env, "R%d points to insn_array map which cannot be used as const string\n", regno); + verbose(env, "%s points to insn_array map which cannot be used as const string\n", + reg_arg_name(env, argno)); return -EACCES; } if (!bpf_map_is_rdonly(map)) { - verbose(env, "R%d does not point to a readonly map'\n", regno); + verbose(env, "%s does not point to a readonly map'\n", reg_arg_name(env, argno)); return -EACCES; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a constant address'\n", regno); + verbose(env, "%s is not a constant address'\n", reg_arg_name(env, argno)); return -EACCES; } @@ -8390,7 +8103,7 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, regno, 0, + err = check_map_access(env, reg, argno, 0, map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) @@ -8472,7 +8185,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, return 0; } -static bool can_elide_value_nullness(enum bpf_map_type type); +static bool can_elide_value_nullness(const struct bpf_map *map); static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, @@ -8482,6 +8195,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, u32 regno = BPF_REG_1 + arg; struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_arg_type arg_type = fn->arg_type[arg]; + argno_t argno = argno_from_arg(arg + 1); enum bpf_reg_type type = reg->type; u32 *arg_btf_id = 
NULL; u32 key_size; @@ -8526,56 +8240,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); + err = check_reg_type(env, reg, argno_from_reg(regno), arg_type, arg_btf_id, meta); if (err) return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type); + err = check_func_arg_reg_off(env, reg, argno_from_reg(regno), arg_type); if (err) return err; skip_type_check: - if (arg_type_is_release(arg_type)) { - if (arg_type_is_dynptr(arg_type)) { - struct bpf_func_state *state = bpf_func(env, reg); - int spi; - - /* Only dynptr created on stack can be released, thus - * the get_spi and stack state checks for spilled_ptr - * should only be done before process_dynptr_func for - * PTR_TO_STACK. - */ - if (reg->type == PTR_TO_STACK) { - spi = dynptr_get_spi(env, reg); - if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) { - verbose(env, "arg %d is an unacquired reference\n", regno); - return -EINVAL; - } - } else { - verbose(env, "cannot release unowned const bpf_dynptr\n"); - return -EINVAL; - } - } else if (!reg->ref_obj_id && !bpf_register_is_null(reg)) { - verbose(env, "R%d must be referenced when passed to release function\n", - regno); - return -EINVAL; - } - if (meta->release_regno) { - verifier_bug(env, "more than one release argument"); - return -EFAULT; - } - meta->release_regno = regno; + if (arg_type_is_release(arg_type) && !arg_type_is_dynptr(arg_type) && + !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + verbose(env, "release helper %s expects referenced PTR_TO_BTF_ID passed to %s\n", + func_id_name(meta->func_id), reg_arg_name(env, argno)); + return -EINVAL; } - if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { - if (meta->ref_obj_id) { - verbose(env, "more than one arg with ref_obj_id R%d %u %u", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EACCES; 
- } - meta->ref_obj_id = reg->ref_obj_id; - } + if (reg_is_referenced(env, reg)) + update_ref_obj(&meta->ref_obj, reg); switch (base_type(arg_type)) { case ARG_CONST_MAP_PTR: @@ -8619,10 +8301,10 @@ skip_type_check: return -EFAULT; } key_size = meta->map.ptr->key_size; - err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); + err = check_helper_mem_access(env, reg, argno_from_reg(regno), key_size, BPF_READ, false, NULL); if (err) return err; - if (can_elide_value_nullness(meta->map.ptr->map_type)) { + if (can_elide_value_nullness(meta->map.ptr)) { err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); if (err < 0) { meta->const_map_key = -1; @@ -8646,7 +8328,7 @@ skip_type_check: return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, meta->map.ptr->value_size, + err = check_helper_mem_access(env, reg, argno_from_reg(regno), meta->map.ptr->value_size, arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; @@ -8664,11 +8346,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); + err = process_spin_lock(env, reg, argno_from_reg(regno), PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, regno, 0); + err = process_spin_lock(env, reg, argno_from_reg(regno), 0); if (err) return err; } else { @@ -8677,7 +8359,7 @@ skip_type_check: } break; case ARG_PTR_TO_TIMER: - err = process_timer_helper(env, regno, meta); + err = process_timer_helper(env, reg, argno_from_reg(regno), meta); if (err) return err; break; @@ -8690,7 +8372,7 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], + err = check_helper_mem_access(env, reg, argno_from_reg(regno), fn->arg_size[arg], arg_type & MEM_WRITE ? 
BPF_WRITE : BPF_READ, false, meta); if (err) @@ -8700,19 +8382,22 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1), + argno_from_reg(regno), fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, + err = check_mem_size_reg(env, reg_state(env, regno - 1), reg, argno_from_reg(regno - 1), + argno_from_reg(regno), fn->arg_type[arg - 1] & MEM_WRITE ? BPF_WRITE : BPF_READ, true, meta); break; case ARG_PTR_TO_DYNPTR: - err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); + err = process_dynptr_func(env, reg, argno_from_reg(regno), insn_idx, arg_type, &meta->ref_obj, + &meta->dynptr); if (err) return err; break; @@ -8729,7 +8414,7 @@ skip_type_check: break; case ARG_PTR_TO_CONST_STR: { - err = check_reg_const_str(env, reg, regno); + err = check_arg_const_str(env, reg, argno_from_reg(regno)); if (err) return err; break; @@ -9131,11 +8816,29 @@ static bool check_mem_arg_rw_flag_ok(const struct bpf_func_proto *fn) return true; } -static int check_func_proto(const struct bpf_func_proto *fn) +static bool check_proto_release_reg(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { + enum bpf_arg_type arg_type = fn->arg_type[i]; + + if (arg_type_is_release(arg_type)) { + if (meta->release_regno) + return false; + meta->release_regno = i + 1; + } + } + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn, struct bpf_call_arg_meta *meta) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && check_mem_arg_rw_flag_ok(fn) && + check_proto_release_reg(fn, meta) && check_btf_id_ok(fn) ? 
0 : -EINVAL; } @@ -9182,14 +8885,14 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } -static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id) +static int release_reference_nomark(struct bpf_verifier_state *state, int id) { int i; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; - if (state->refs[i].id == ref_obj_id) { + if (state->refs[i].id == id) { release_reference_state(state, i); return 0; } @@ -9197,26 +8900,83 @@ static int release_reference_nomark(struct bpf_verifier_state *state, int ref_ob return -EINVAL; } -/* The pointer with the specified id has released its reference to kernel - * resources. Identify all copies of the same pointer and clear the reference. - * - * This is the release function corresponding to acquire_reference(). Idempotent. - */ -static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) +static int idstack_push(struct bpf_idmap *idmap, u32 id) +{ + int i; + + if (!id) + return 0; + + for (i = 0; i < idmap->cnt; i++) + if (idmap->map[i].old == id) + return 0; + + if (WARN_ON_ONCE(idmap->cnt >= BPF_ID_MAP_SIZE)) + return -EFAULT; + + idmap->map[idmap->cnt++].old = id; + return 0; +} + +static int idstack_pop(struct bpf_idmap *idmap) +{ + if (!idmap->cnt) + return 0; + + return idmap->map[--idmap->cnt].old; +} + +/* Release id and objects derived from it iteratively in a DFS manner */ +static int release_reference(struct bpf_verifier_env *env, int id) { + u32 mask = (1 << STACK_SPILL) | (1 << STACK_DYNPTR); struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_idmap *idstack = &env->idmap_scratch; + struct bpf_stack_state *stack; struct bpf_func_state *state; struct bpf_reg_state *reg; - int err; + int i, err; - err = release_reference_nomark(vstate, ref_obj_id); + idstack->cnt = 0; + err = idstack_push(idstack, id); if (err) return err; - bpf_for_each_reg_in_vstate(vstate, 
state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) - mark_reg_invalid(env, reg); - })); + if (find_reference_state(vstate, id)) + WARN_ON_ONCE(release_reference_nomark(vstate, id)); + + while ((id = idstack_pop(idstack))) { + /* + * Child references are inaccessible after parent is released, + * any child references that exist at this point are a leak. + */ + for (i = 0; i < vstate->acquired_refs; i++) { + if (vstate->refs[i].type != REF_TYPE_PTR) + continue; + if (vstate->refs[i].parent_id != id) + continue; + verbose(env, "Leaking reference id=%d alloc_insn=%d. Release it first.\n", + vstate->refs[i].id, vstate->refs[i].insn_idx); + return -EINVAL; + } + + bpf_for_each_reg_in_vstate_mask(vstate, state, reg, stack, mask, ({ + if (reg->id != id && reg->parent_id != id) + continue; + + /* Free objects derived from the current object */ + if (reg->parent_id == id) { + err = idstack_push(idstack, reg->id); + if (err) + return err; + } + + if (!stack || stack->slot_type[BPF_REG_SIZE - 1] == STACK_SPILL) + mark_reg_invalid(env, reg); + else if (stack->slot_type[BPF_REG_SIZE - 1] == STACK_DYNPTR) + invalidate_dynptr(env, stack); + })); + } return 0; } @@ -9232,6 +8992,42 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env) })); } +static void invalidate_rcu_protected_refs(struct bpf_verifier_env *env) +{ + struct bpf_stack_state *stack; + struct bpf_func_state *state; + struct bpf_reg_state *reg; + u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); + + bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, stack, clear_mask, ({ + if (reg->type & MEM_RCU) { + reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); + reg->type |= PTR_UNTRUSTED; + } + })); +} + +static int ref_convert_alloc_rcu_protected(struct bpf_verifier_env *env, u32 id) +{ + struct bpf_func_state *state; + struct bpf_reg_state *reg; + int err; + + err = release_reference_nomark(env->cur_state, id); + + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->id != id) + 
continue; + if ((reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { + reg->id = 0; + reg->type &= ~MEM_ALLOC; + reg->type |= MEM_RCU; + } + })); + + return err; +} + static void clear_caller_saved_regs(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { @@ -9244,6 +9040,15 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env, } } +static void invalidate_outgoing_stack_args(const struct bpf_verifier_env *env, + struct bpf_func_state *state) +{ + int i, nslots = state->out_stack_arg_cnt; + + for (i = 0; i < nslots; i++) + bpf_mark_reg_not_init(env, &state->stack_arg_regs[i]); +} + typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -9306,11 +9111,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs) { struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_func_state *caller = cur_func(env); struct bpf_verifier_log *log = &env->log; + struct ref_obj_desc ref_obj = {}; u32 i; - int ret; + int ret, err; ret = btf_prepare_func_args(env, subprog); + if (ret) { + if (bpf_in_stack_arg_cnt(sub) > 0) { + err = check_outgoing_stack_args(env, caller, sub->arg_cnt); + if (err) + return err; + } + return ret; + } + + ret = check_outgoing_stack_args(env, caller, sub->arg_cnt); if (ret) return ret; @@ -9318,13 +9135,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * verifier sees. 
*/ for (i = 0; i < sub->arg_cnt; i++) { - u32 regno = i + 1; - struct bpf_reg_state *reg = ®s[regno]; + argno_t argno = argno_from_arg(i + 1); + struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i); struct bpf_subprog_arg_info *arg = &sub->args[i]; if (arg->arg_type == ARG_ANYTHING) { if (reg->type != SCALAR_VALUE) { - bpf_log(log, "R%d is not a scalar\n", regno); + bpf_log(log, "%s is not a scalar\n", reg_arg_name(env, argno)); return -EINVAL; } } else if (arg->arg_type & PTR_UNTRUSTED) { @@ -9334,24 +9151,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * invalid memory access. */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_CTX); + ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_CTX); if (ret < 0) return ret; /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ if (reg->type != PTR_TO_CTX) { - bpf_log(log, "arg#%d expects pointer to ctx\n", i); + bpf_log(log, "%s expects pointer to ctx\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + ret = check_func_arg_reg_off(env, reg, argno, ARG_DONTCARE); if (ret < 0) return ret; - if (check_mem_reg(env, reg, regno, arg->mem_size)) + if (check_mem_reg(env, reg, argno, arg->mem_size)) return -EINVAL; if (!(arg->arg_type & PTR_MAYBE_NULL) && (reg->type & PTR_MAYBE_NULL)) { - bpf_log(log, "arg#%d is expected to be non-NULL\n", i); + bpf_log(log, "%s is expected to be non-NULL\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (base_type(arg->arg_type) == ARG_PTR_TO_ARENA) { @@ -9363,15 +9182,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * run-time debug nightmare. 
*/ if (reg->type != PTR_TO_ARENA && reg->type != SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer to arena or scalar.\n", regno); + bpf_log(log, "%s is not a pointer to arena or scalar.\n", + reg_arg_name(env, argno)); return -EINVAL; } - } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { - ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR); + } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { + ret = check_func_arg_reg_off(env, reg, argno, ARG_PTR_TO_DYNPTR); if (ret) return ret; - ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0); + ret = process_dynptr_func(env, reg, argno, -1, arg->arg_type, &ref_obj, NULL); if (ret) return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { @@ -9382,12 +9202,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); - err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); + err = check_reg_type(env, reg, argno, arg->arg_type, &arg->btf_id, &meta); + err = err ?: check_func_arg_reg_off(env, reg, argno, arg->arg_type); if (err) return err; } else { - verifier_bug(env, "unrecognized arg#%d type %d", i, arg->arg_type); + verifier_bug(env, "unrecognized %s type %d", + reg_arg_name(env, argno), arg->arg_type); return -EFAULT; } } @@ -9499,10 +9320,15 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins return 0; } +static int process_bpf_exit_full(struct bpf_verifier_env *env, + bool *do_print_state, bool exception_exit); + static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; + struct bpf_subprog_info *caller_info; + u16 callee_incoming, stack_arg_cnt; struct bpf_func_state *caller; int err, subprog, target_insn; @@ -9545,6 +9371,7 @@ static int check_func_call(struct 
bpf_verifier_env *env, struct bpf_insn *insn, /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); + invalidate_outgoing_stack_args(env, cur_func(env)); /* All non-void global functions return a 64-bit SCALAR_VALUE. */ if (!subprog_returns_void(env, subprog)) { @@ -9552,10 +9379,31 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; } + if (env->subprog_info[subprog].might_throw) { + struct bpf_verifier_state *branch; + + branch = push_stack(env, *insn_idx + 1, *insn_idx, false); + if (IS_ERR(branch)) { + verbose(env, "failed to push state for global subprog exception path\n"); + return PTR_ERR(branch); + } + return process_bpf_exit_full(env, NULL, true); + } + /* continue with next insn after call */ return 0; } + /* + * Track caller's total stack arg count (incoming + max outgoing). + * This is needed so the JIT knows how much stack arg space to allocate. + */ + caller_info = &env->subprog_info[caller->subprogno]; + callee_incoming = bpf_in_stack_arg_cnt(&env->subprog_info[subprog]); + stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + callee_incoming; + if (stack_arg_cnt > caller_info->stack_arg_cnt) + caller_info->stack_arg_cnt = stack_arg_cnt; + /* for regular function entry setup new frame and continue * from that frame. 
*/ @@ -9839,9 +9687,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) { if (range.return_32bit) - return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; + return range.minval <= reg_s32_min(reg) && reg_s32_max(reg) <= range.maxval; else - return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; + return range.minval <= reg_smin(reg) && reg_smax(reg) <= range.maxval; } static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) @@ -9913,6 +9761,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) * bpf_throw, this will be done by copy_verifier_state for extra frames. */ free_func_state(callee); state->frame[state->curframe--] = NULL; + invalidate_outgoing_stack_args(env, caller); /* for callbacks widen imprecise scalars to make programs like below verify: * @@ -9939,7 +9788,9 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, int func_id, struct bpf_call_arg_meta *meta) { + struct bpf_retval_range range; struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); if (ret_type != RET_INTEGER) return 0; @@ -9950,21 +9801,36 @@ static int do_refine_retval_range(struct bpf_verifier_env *env, case BPF_FUNC_probe_read_str: case BPF_FUNC_probe_read_kernel_str: case BPF_FUNC_probe_read_user_str: - ret_reg->smax_value = meta->msize_max_value; - ret_reg->s32_max_value = meta->msize_max_value; - ret_reg->smin_value = -MAX_ERRNO; - ret_reg->s32_min_value = -MAX_ERRNO; + reg_set_srange64(ret_reg, -MAX_ERRNO, meta->msize_max_value); + reg_set_srange32(ret_reg, -MAX_ERRNO, meta->msize_max_value); reg_bounds_sync(ret_reg); break; case BPF_FUNC_get_smp_processor_id: - ret_reg->umax_value = nr_cpu_ids - 1; - ret_reg->u32_max_value = nr_cpu_ids - 1; - ret_reg->smax_value = nr_cpu_ids - 1; - ret_reg->s32_max_value = 
nr_cpu_ids - 1; - ret_reg->umin_value = 0; - ret_reg->u32_min_value = 0; - ret_reg->smin_value = 0; - ret_reg->s32_min_value = 0; + reg_set_urange64(ret_reg, 0, nr_cpu_ids - 1); + reg_set_urange32(ret_reg, 0, nr_cpu_ids - 1); + reg_bounds_sync(ret_reg); + break; + case BPF_FUNC_get_retval: + /* + * bpf_get_retval may see arbitrary value passed by bpf_prog_run_array_cg for + * CGROUP_GETSOCKOPT type. + */ + if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT && + env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT) + break; + + if (prog_type == BPF_PROG_TYPE_LSM && + env->prog->expected_attach_type == BPF_LSM_CGROUP) { + if (!env->prog->aux->attach_func_proto->type) + break; + bpf_lsm_get_retval_range(env->prog, &range); + } else { + range.minval = -MAX_ERRNO; + range.maxval = 0; + } + + reg_set_srange64(ret_reg, range.minval, range.maxval); + reg_set_srange32(ret_reg, range.minval, range.maxval); reg_bounds_sync(ret_reg); break; } @@ -10073,7 +9939,7 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi * kernel. Type checks are performed later in check_return_code. */ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && - reg->ref_obj_id == state->refs[i].id) + reg->id == state->refs[i].id) continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); @@ -10208,13 +10074,16 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno state->callback_subprogno == subprogno); } -/* Returns whether or not the given map type can potentially elide +/* Returns whether or not the given map can potentially elide * lookup return value nullness check. This is possible if the key * is statically known. 
*/ -static bool can_elide_value_nullness(enum bpf_map_type type) +static bool can_elide_value_nullness(const struct bpf_map *map) { - switch (type) { + if (map->map_flags & BPF_F_INNER_MAP) + return false; + + switch (map->map_type) { case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_ARRAY: return true; @@ -10259,6 +10128,24 @@ static const char *non_sleepable_context_description(struct bpf_verifier_env *en return "non-sleepable prog"; } +static int release_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + bool convert_rcu, bool release_dynptr) +{ + int err = -EINVAL; + + if (bpf_register_is_null(reg)) + return 0; + + if (release_dynptr) + err = unmark_stack_slots_dynptr(env, reg); + else if (convert_rcu) + err = ref_convert_alloc_rcu_protected(env, reg->id); + else if (reg_is_referenced(env, reg)) + err = release_reference(env, reg->id); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -10308,7 +10195,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn); + err = check_func_proto(fn, &meta); if (err) { verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; @@ -10340,55 +10227,26 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (err) return err; + regs = cur_regs(env); + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. 
*/ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, + err = check_mem_access(env, insn_idx, regs + meta.regno, argno_from_reg(meta.regno), i, BPF_B, BPF_WRITE, -1, false, false); if (err) return err; } - regs = cur_regs(env); - if (meta.release_regno) { - err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { - err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - } else if (func_id == BPF_FUNC_kptr_xchg && meta.ref_obj_id) { - u32 ref_obj_id = meta.ref_obj_id; - bool in_rcu = in_rcu_cs(env); - struct bpf_func_state *state; - struct bpf_reg_state *reg; - - err = release_reference_nomark(env->cur_state, ref_obj_id); - if (!err) { - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { - if (in_rcu && (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU)) { - reg->ref_obj_id = 0; - reg->type &= ~MEM_ALLOC; - reg->type |= MEM_RCU; - } else { - mark_reg_invalid(env, reg); - } - } - })); - } - } else if (meta.ref_obj_id) { - err = release_reference(env, meta.ref_obj_id); - } else if (bpf_register_is_null(®s[meta.release_regno])) { - /* meta.ref_obj_id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. 
- */ - err = 0; - } - if (err) { - verbose(env, "func %s#%d reference has not been acquired before\n", - func_id_name(func_id), func_id); + struct bpf_reg_state *reg = ®s[meta.release_regno]; + bool convert_rcu = (func_id == BPF_FUNC_kptr_xchg) && in_rcu_cs(env) && + (reg->type & MEM_ALLOC) && (reg->type & MEM_PERCPU); + + err = release_reg(env, reg, convert_rcu, !!meta.dynptr.id); + if (err) return err; - } } switch (func_id) { @@ -10429,7 +10287,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = mark_chain_precision(env, BPF_REG_1); if (err) return err; - if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) { + if (cur_func(env)->callback_depth < reg_umax(®s[BPF_REG_1])) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_loop_callback_state); } else { @@ -10447,6 +10305,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } break; case BPF_FUNC_set_retval: + { + struct bpf_retval_range range = { + .minval = -MAX_ERRNO, + .maxval = 0, + .return_32bit = true + }; + struct bpf_reg_state *r1 = ®s[BPF_REG_1]; + + if (r1->type != SCALAR_VALUE) { + verbose(env, "R1 is not a scalar\n"); + return -EINVAL; + } + + /* CGROUP_GETSOCKOPT is allowed to return arbitrary value */ + if (prog_type == BPF_PROG_TYPE_CGROUP_SOCKOPT && + env->prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT) + break; + if (prog_type == BPF_PROG_TYPE_LSM && env->prog->expected_attach_type == BPF_LSM_CGROUP) { if (!env->prog->aux->attach_func_proto->type) { @@ -10456,54 +10332,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n"); return -EINVAL; } - } - break; - case BPF_FUNC_dynptr_data: - { - struct bpf_reg_state *reg; - int id, ref_obj_id; - - reg = get_dynptr_arg_reg(env, fn, regs); - if (!reg) - return -EFAULT; - - - if (meta.dynptr_id) { - verifier_bug(env, 
"meta.dynptr_id already set"); - return -EFAULT; - } - if (meta.ref_obj_id) { - verifier_bug(env, "meta.ref_obj_id already set"); - return -EFAULT; + bpf_lsm_get_retval_range(env->prog, &range); } - id = dynptr_id(env, reg); - if (id < 0) { - verifier_bug(env, "failed to obtain dynptr id"); - return id; - } + err = mark_chain_precision(env, BPF_REG_1); + if (err) + return err; - ref_obj_id = dynptr_ref_obj_id(env, reg); - if (ref_obj_id < 0) { - verifier_bug(env, "failed to obtain dynptr ref_obj_id"); - return ref_obj_id; + if (!retval_range_within(range, r1)) { + verbose_invalid_scalar(env, r1, range, "At bpf_set_retval", "R1"); + return -EINVAL; } - meta.dynptr_id = id; - meta.ref_obj_id = ref_obj_id; - break; } case BPF_FUNC_dynptr_write: { - enum bpf_dynptr_type dynptr_type; - struct bpf_reg_state *reg; + enum bpf_dynptr_type dynptr_type = meta.dynptr.type; - reg = get_dynptr_arg_reg(env, fn, regs); - if (!reg) - return -EFAULT; - - dynptr_type = dynptr_get_type(env, reg); if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; @@ -10547,6 +10393,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn bpf_mark_reg_not_init(env, ®s[caller_saved[i]]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } + invalidate_outgoing_stack_args(env, cur_func(env)); /* helper call returns 64-bit value. 
*/ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; @@ -10576,7 +10423,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (func_id == BPF_FUNC_map_lookup_elem && - can_elide_value_nullness(meta.map.ptr->map_type) && + can_elide_value_nullness(meta.map.ptr) && meta.const_map_key >= 0 && meta.const_map_key < meta.map.ptr->max_entries) ret_flag &= ~PTR_MAYBE_NULL; @@ -10688,29 +10535,45 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; - if (helper_multiple_ref_obj_use(func_id, meta.map.ptr)) { - verifier_bug(env, "func %s#%d sets ref_obj_id more than once", - func_id_name(func_id), func_id); - return -EFAULT; - } + if (is_ptr_cast_function(func_id) && + find_reference_state(env->cur_state, meta.ref_obj.id)) { + struct bpf_verifier_state *branch; + struct bpf_reg_state *r0; - if (is_dynptr_ref_function(func_id)) - regs[BPF_REG_0].dynptr_id = meta.dynptr_id; + err = validate_ref_obj(env, &meta.ref_obj); + if (err) + return err; - if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) { - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + /* + * In order for a release of any of the original or cast pointers + * to invalidate all other pointers, reuse the same reference id for + * the cast result. + * This reference id can't be used for nullness propagation, + * as cast might return NULL for a non-NULL input. + * Hence, explore the NULL case as a separate branch. 
+ */ + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (IS_ERR(branch)) + return PTR_ERR(branch); + + r0 = &branch->frame[branch->curframe]->regs[BPF_REG_0]; + __mark_reg_known_zero(r0); + r0->type = SCALAR_VALUE; + + regs[BPF_REG_0].type &= ~PTR_MAYBE_NULL; + regs[BPF_REG_0].id = meta.ref_obj.id; } else if (is_acquire_function(func_id, meta.map.ptr)) { - int id = acquire_reference(env, insn_idx); + int id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; - /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; - /* For release_reference() */ - regs[BPF_REG_0].ref_obj_id = id; } + if (func_id == BPF_FUNC_dynptr_data) + regs[BPF_REG_0].parent_id = meta.dynptr.id; + err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); if (err) return err; @@ -10806,7 +10669,6 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) return meta->kfunc_flags & KF_RELEASE; } - static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_DESTRUCTIVE; @@ -10883,6 +10745,11 @@ static bool is_kfunc_arg_nullable(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__nullable"); } +static bool is_kfunc_arg_nonown_allowed(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__nonown_allowed"); +} + static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param *arg) { return btf_param_match_suffix(btf, arg, "__str"); @@ -11123,10 +10990,15 @@ enum special_kfunc_type { KF_bpf_list_push_front, KF_bpf_list_push_back_impl, KF_bpf_list_push_back, + KF_bpf_list_add, KF_bpf_list_pop_front, KF_bpf_list_pop_back, + KF_bpf_list_del, KF_bpf_list_front, KF_bpf_list_back, + KF_bpf_list_is_first, + KF_bpf_list_is_last, + KF_bpf_list_empty, KF_bpf_cast_to_kern_ctx, KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, @@ -11191,10 +11063,15 @@ BTF_ID(func, bpf_list_push_front_impl) BTF_ID(func, 
bpf_list_push_front) BTF_ID(func, bpf_list_push_back_impl) BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_add) BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_list_del) BTF_ID(func, bpf_list_front) BTF_ID(func, bpf_list_back) +BTF_ID(func, bpf_list_is_first) +BTF_ID(func, bpf_list_is_last) +BTF_ID(func, bpf_list_empty) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) @@ -11263,7 +11140,11 @@ BTF_ID(func, bpf_task_work_schedule_resume) BTF_ID(func, bpf_arena_alloc_pages) BTF_ID(func, bpf_arena_free_pages) BTF_ID(func, bpf_arena_reserve_pages) +#ifdef CONFIG_BPF_EVENTS BTF_ID(func, bpf_session_is_return) +#else +BTF_ID_UNUSED +#endif BTF_ID(func, bpf_stream_vprintk) BTF_ID(func, bpf_stream_print_stack) @@ -11302,7 +11183,8 @@ static bool is_bpf_list_push_kfunc(u32 func_id) return func_id == special_kfunc_list[KF_bpf_list_push_front] || func_id == special_kfunc_list[KF_bpf_list_push_front_impl] || func_id == special_kfunc_list[KF_bpf_list_push_back] || - func_id == special_kfunc_list[KF_bpf_list_push_back_impl]; + func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || + func_id == special_kfunc_list[KF_bpf_list_add]; } static bool is_bpf_rbtree_add_kfunc(u32 func_id) @@ -11351,15 +11233,12 @@ bool bpf_is_kfunc_pkt_changing(struct bpf_kfunc_call_arg_meta *meta) } static enum kfunc_ptr_arg_type -get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, - struct bpf_kfunc_call_arg_meta *meta, +get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, struct bpf_func_state *caller, + struct bpf_reg_state *regs, struct bpf_kfunc_call_arg_meta *meta, const struct btf_type *t, const struct btf_type *ref_t, const char *ref_tname, const struct btf_param *args, - int argno, int nargs) + int arg, int nargs, argno_t argno, struct bpf_reg_state *reg) { - u32 regno = argno + 1; - struct bpf_reg_state *regs = cur_regs(env); - struct bpf_reg_state *reg = ®s[regno]; bool arg_mem_size = false; if 
(meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || @@ -11367,9 +11246,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, meta->func_id == special_kfunc_list[KF_bpf_session_cookie]) return KF_ARG_PTR_TO_CTX; - if (argno + 1 < nargs && - (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]) || - is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))) + if (arg + 1 < nargs && + (is_kfunc_arg_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)) || + is_kfunc_arg_const_mem_size(meta->btf, &args[arg + 1], get_func_arg_reg(caller, regs, arg + 1)))) arg_mem_size = true; /* In this function, we verify the kfunc's BTF as per the argument type, @@ -11377,68 +11256,69 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, * type to our caller. When a set of conditions hold in the BTF type of * arguments, we resolve it to a known kfunc_ptr_arg_type. */ - if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) + if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), arg)) return KF_ARG_PTR_TO_CTX; - if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && bpf_register_is_null(reg) && + if (is_kfunc_arg_nullable(meta->btf, &args[arg]) && bpf_register_is_null(reg) && !arg_mem_size) return KF_ARG_PTR_TO_NULL; - if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) + if (is_kfunc_arg_alloc_obj(meta->btf, &args[arg])) return KF_ARG_PTR_TO_ALLOC_BTF_ID; - if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno])) + if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[arg])) return KF_ARG_PTR_TO_REFCOUNTED_KPTR; - if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) + if (is_kfunc_arg_dynptr(meta->btf, &args[arg])) return KF_ARG_PTR_TO_DYNPTR; - if (is_kfunc_arg_iter(meta, argno, &args[argno])) + if (is_kfunc_arg_iter(meta, arg, &args[arg])) return KF_ARG_PTR_TO_ITER; - if (is_kfunc_arg_list_head(meta->btf, &args[argno])) + if (is_kfunc_arg_list_head(meta->btf, 
&args[arg])) return KF_ARG_PTR_TO_LIST_HEAD; - if (is_kfunc_arg_list_node(meta->btf, &args[argno])) + if (is_kfunc_arg_list_node(meta->btf, &args[arg])) return KF_ARG_PTR_TO_LIST_NODE; - if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno])) + if (is_kfunc_arg_rbtree_root(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RB_ROOT; - if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) + if (is_kfunc_arg_rbtree_node(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RB_NODE; - if (is_kfunc_arg_const_str(meta->btf, &args[argno])) + if (is_kfunc_arg_const_str(meta->btf, &args[arg])) return KF_ARG_PTR_TO_CONST_STR; - if (is_kfunc_arg_map(meta->btf, &args[argno])) + if (is_kfunc_arg_map(meta->btf, &args[arg])) return KF_ARG_PTR_TO_MAP; - if (is_kfunc_arg_wq(meta->btf, &args[argno])) + if (is_kfunc_arg_wq(meta->btf, &args[arg])) return KF_ARG_PTR_TO_WORKQUEUE; - if (is_kfunc_arg_timer(meta->btf, &args[argno])) + if (is_kfunc_arg_timer(meta->btf, &args[arg])) return KF_ARG_PTR_TO_TIMER; - if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + if (is_kfunc_arg_task_work(meta->btf, &args[arg])) return KF_ARG_PTR_TO_TASK_WORK; - if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) + if (is_kfunc_arg_irq_flag(meta->btf, &args[arg])) return KF_ARG_PTR_TO_IRQ_FLAG; - if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[arg])) return KF_ARG_PTR_TO_RES_SPIN_LOCK; if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { - verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", - meta->func_name, argno, btf_type_str(ref_t), ref_tname); + verbose(env, "kernel function %s %s pointer type %s %s is not supported\n", + meta->func_name, reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname); return -EINVAL; } return KF_ARG_PTR_TO_BTF_ID; } - if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) + if (is_kfunc_arg_callback(env, meta->btf, &args[arg])) 
return KF_ARG_PTR_TO_CALLBACK; /* This is the catch all argument type of register types supported by @@ -11448,8 +11328,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, */ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { - verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", - argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); + verbose(env, "%s pointer type %s %s must point to %sscalar, or struct with scalar\n", + reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); return -EINVAL; } return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; @@ -11460,7 +11341,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, const struct btf_type *ref_t, const char *ref_tname, u32 ref_id, struct bpf_kfunc_call_arg_meta *meta, - int argno) + int arg, argno_t argno) { const struct btf_type *reg_ref_t; bool strict_type_match = false; @@ -11502,7 +11383,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, * btf_struct_ids_match() to walk the struct at the 0th offset, and * resolve types. 
*/ - if ((is_kfunc_release(meta) && reg->ref_obj_id) || + if ((is_kfunc_release(meta) && reg_is_referenced(env, reg)) || btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; @@ -11518,19 +11399,19 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, */ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname); if (!taking_projection && !struct_same) { - verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", - meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1, + verbose(env, "kernel function %s %s expected pointer to %s %s but %s has a pointer to %s %s\n", + meta->func_name, reg_arg_name(env, argno), + btf_type_str(ref_t), ref_tname, reg_arg_name(env, argno), btf_type_str(reg_ref_t), reg_ref_tname); return -EINVAL; } return 0; } -static int process_irq_flag(struct bpf_verifier_env *env, int regno, +static int process_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - struct bpf_reg_state *reg = reg_state(env, regno); - int err, kfunc_class = IRQ_NATIVE_KFUNC; + int err, spi, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || @@ -11550,11 +11431,13 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (irq_save) { if (!is_irq_flag_reg_valid_uninit(env, reg)) { - verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1); + verbose(env, "expected uninitialized irq flag as %s\n", + reg_arg_name(env, argno)); return -EINVAL; } - err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); + err = check_mem_access(env, env->insn_idx, reg, argno, 0, BPF_DW, + BPF_WRITE, -1, false, false); if (err) return err; @@ -11564,13 +11447,16 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, } else { err = 
is_irq_flag_reg_valid_init(env, reg); if (err) { - verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1); + verbose(env, "expected an initialized irq flag as %s\n", + reg_arg_name(env, argno)); return err; } - err = mark_irq_flag_read(env, reg); - if (err) - return err; + spi = irq_flag_get_spi(env, reg); + if (spi < 0) + return spi; + + mark_stack_slots_scratched(env, spi, 1); err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) @@ -11601,36 +11487,21 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state return 0; } -static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id) +static void ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 id) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_func_state *unused; struct bpf_reg_state *reg; - int i; - if (!ref_obj_id) { - verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); - return -EFAULT; - } + WARN_ON_ONCE(release_reference_nomark(env->cur_state, id)); - for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].id != ref_obj_id) - continue; - - /* Clear ref_obj_id here so release_reference doesn't clobber - * the whole reg - */ - bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ - if (reg->ref_obj_id == ref_obj_id) { - reg->ref_obj_id = 0; - ref_set_non_owning(env, reg); - } - })); - return 0; - } + bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ + if (reg->id == id) { + reg->id = 0; + ref_set_non_owning(env, reg); + } + })); - verifier_bug(env, "ref state missing for ref_obj_id"); - return -EFAULT; + return; } /* Implementation details: @@ -11711,8 +11582,12 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) return is_bpf_list_push_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_list_pop_front] || btf_id == special_kfunc_list[KF_bpf_list_pop_back] || + btf_id == special_kfunc_list[KF_bpf_list_del] || btf_id == 
special_kfunc_list[KF_bpf_list_front] || - btf_id == special_kfunc_list[KF_bpf_list_back]; + btf_id == special_kfunc_list[KF_bpf_list_back] || + btf_id == special_kfunc_list[KF_bpf_list_is_first] || + btf_id == special_kfunc_list[KF_bpf_list_is_last] || + btf_id == special_kfunc_list[KF_bpf_list_empty]; } static bool is_bpf_rbtree_api_kfunc(u32 btf_id) @@ -11778,7 +11653,7 @@ static bool is_async_callback_calling_kfunc(u32 btf_id) is_task_work_add_kfunc(btf_id); } -static bool is_bpf_throw_kfunc(struct bpf_insn *insn) +bool bpf_is_throw_kfunc(struct bpf_insn *insn) { return bpf_pseudo_kfunc_call(insn) && insn->off == 0 && insn->imm == special_kfunc_list[KF_bpf_throw]; @@ -11833,7 +11708,10 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, switch (node_field_type) { case BPF_LIST_NODE: - ret = is_bpf_list_push_kfunc(kfunc_btf_id); + ret = is_bpf_list_push_kfunc(kfunc_btf_id) || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_del] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_first] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_is_last]; break; case BPF_RB_NODE: ret = (is_bpf_rbtree_add_kfunc(kfunc_btf_id) || @@ -11855,7 +11733,7 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, static int __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, struct btf_field **head_field) @@ -11876,8 +11754,8 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, head_type_name = btf_field_type_name(head_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, head_type_name); + "%s doesn't have constant offset. 
%s has to be at the constant offset\n", + reg_arg_name(env, argno), head_type_name); return -EINVAL; } @@ -11905,24 +11783,24 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD, + return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_LIST_HEAD, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT, + return __process_kf_arg_ptr_to_graph_root(env, reg, argno, meta, BPF_RB_ROOT, &meta->arg_rbtree_root.field); } static int __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta, enum btf_field_type head_field_type, enum btf_field_type node_field_type, @@ -11944,8 +11822,8 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, node_type_name = btf_field_type_name(node_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. %s has to be at the constant offset\n", - regno, node_type_name); + "%s doesn't have constant offset. 
%s has to be at the constant offset\n", + reg_arg_name(env, argno), node_type_name); return -EINVAL; } @@ -11986,19 +11864,19 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, } static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta, BPF_LIST_HEAD, BPF_LIST_NODE, &meta->arg_list_head.field); } static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, + struct bpf_reg_state *reg, argno_t argno, struct bpf_kfunc_call_arg_meta *meta) { - return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + return __process_kf_arg_ptr_to_graph_node(env, reg, argno, meta, BPF_RB_ROOT, BPF_RB_NODE, &meta->arg_rbtree_root.field); } @@ -12029,6 +11907,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int insn_idx) { const char *func_name = meta->func_name, *ref_tname; + struct bpf_func_state *caller = cur_func(env); + struct bpf_reg_state *regs = cur_regs(env); const struct btf *btf = meta->btf; const struct btf_param *args; struct btf_record *rec; @@ -12037,20 +11917,31 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ args = (const struct btf_param *)(meta->func_proto + 1); nargs = btf_type_vlen(meta->func_proto); - if (nargs > MAX_BPF_FUNC_REG_ARGS) { + if (nargs > MAX_BPF_FUNC_ARGS) { verbose(env, "Function %s has %d > %d args\n", func_name, nargs, - MAX_BPF_FUNC_REG_ARGS); + MAX_BPF_FUNC_ARGS); return -EINVAL; } + if (nargs > MAX_BPF_FUNC_REG_ARGS && !bpf_jit_supports_stack_args()) { + verbose(env, "JIT does not support kfunc %s() with %d args\n", + func_name, nargs); + return -ENOTSUPP; + } + + ret = check_outgoing_stack_args(env, caller, nargs); + if (ret) 
+ return ret; /* Check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { - struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1]; + struct bpf_reg_state *reg = get_func_arg_reg(caller, regs, i); const struct btf_type *t, *ref_t, *resolve_ret; enum bpf_arg_type arg_type = ARG_DONTCARE; - u32 regno = i + 1, ref_id, type_size; + argno_t argno = argno_from_arg(i + 1); + int regno = reg_from_argno(argno); + u32 ref_id, type_size; bool is_ret_buf_sz = false; int kf_arg_type; @@ -12060,6 +11951,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; } + if (regno < 0) { + verbose(env, "%s prog->aux cannot be a stack argument\n", + reg_arg_name(env, argno)); + return -EINVAL; + } meta->arg_prog = true; cur_aux(env)->arg_prog = regno; continue; @@ -12072,7 +11968,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (btf_type_is_scalar(t)) { if (reg->type != SCALAR_VALUE) { - verbose(env, "R%d is not a scalar\n", regno); + verbose(env, "%s is not a scalar\n", reg_arg_name(env, argno)); return -EINVAL; } @@ -12082,10 +11978,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d must be a known constant\n", regno); + verbose(env, "%s must be a known constant\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = mark_chain_precision(env, regno); + if (regno >= 0) + ret = mark_chain_precision(env, regno); + else + ret = mark_stack_arg_precision(env, i); if (ret < 0) return ret; meta->arg_constant.found = true; @@ -12104,12 +12004,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d is not a const\n", regno); + verbose(env, "%s is not a const\n", + reg_arg_name(env, 
argno)); return -EINVAL; } meta->r0_size = reg->var_off.value; - ret = mark_chain_precision(env, regno); + if (regno >= 0) + ret = mark_chain_precision(env, regno); + else + ret = mark_stack_arg_precision(env, i); if (ret) return ret; } @@ -12117,32 +12021,33 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (!btf_type_is_ptr(t)) { - verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t)); + verbose(env, "Unrecognized %s type %s\n", + reg_arg_name(env, argno), btf_type_str(t)); return -EINVAL; } if ((bpf_register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { - verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); + verbose(env, "Possibly NULL pointer passed to trusted %s\n", + reg_arg_name(env, argno)); return -EACCES; } - if (reg->ref_obj_id) { - if (is_kfunc_release(meta) && meta->ref_obj_id) { - verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; - } - meta->ref_obj_id = reg->ref_obj_id; - if (is_kfunc_release(meta)) - meta->release_regno = regno; + if (regno == meta->release_regno && !is_kfunc_arg_dynptr(meta->btf, &args[i]) && + !reg_is_referenced(env, reg) && !bpf_register_is_null(reg)) { + verbose(env, "release kfunc %s expects referenced PTR_TO_BTF_ID passed to %s\n", + func_name, reg_arg_name(env, argno)); + return -EINVAL; } + if (reg_is_referenced(env, reg)) + update_ref_obj(&meta->ref_obj, reg); + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs); + kf_arg_type = get_kfunc_ptr_arg_type(env, caller, regs, meta, t, ref_t, ref_tname, + args, i, nargs, argno, reg); if (kf_arg_type < 0) return kf_arg_type; @@ -12151,7 +12056,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ continue; case 
KF_ARG_PTR_TO_MAP: if (!reg->map_ptr) { - verbose(env, "pointer in R%d isn't map pointer\n", regno); + verbose(env, "pointer in %s isn't map pointer\n", + reg_arg_name(env, argno)); return -EINVAL; } if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || @@ -12187,18 +12093,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ fallthrough; case KF_ARG_PTR_TO_ALLOC_BTF_ID: case KF_ARG_PTR_TO_BTF_ID: - if (!is_trusted_reg(reg)) { + if (!is_trusted_reg(env, reg)) { if (!is_kfunc_rcu(meta)) { - verbose(env, "R%d must be referenced or trusted\n", regno); + verbose(env, "%s must be referenced or trusted\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!is_rcu_reg(reg)) { - verbose(env, "R%d must be a rcu pointer\n", regno); + verbose(env, "%s must be a rcu pointer\n", + reg_arg_name(env, argno)); return -EINVAL; } } fallthrough; - case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: @@ -12215,6 +12122,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; + case KF_ARG_PTR_TO_DYNPTR: + arg_type = ARG_PTR_TO_DYNPTR; + break; case KF_ARG_PTR_TO_CTX: arg_type = ARG_PTR_TO_CTX; break; @@ -12223,17 +12133,17 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } - if (is_kfunc_release(meta) && reg->ref_obj_id) + if (regno == meta->release_regno) arg_type |= OBJ_RELEASE; - ret = check_func_arg_reg_off(env, reg, regno, arg_type); + ret = check_func_arg_reg_off(env, reg, argno, arg_type); if (ret < 0) return ret; switch (kf_arg_type) { case KF_ARG_PTR_TO_CTX: if (reg->type != PTR_TO_CTX) { - verbose(env, "arg#%d expected pointer to ctx, but got %s\n", - i, reg_type_str(env, reg->type)); + verbose(env, "%s expected pointer to ctx, but got %s\n", + reg_arg_name(env, argno), reg_type_str(env, reg->type)); return -EINVAL; } @@ -12247,19 
+12157,22 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_ALLOC_BTF_ID: if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC)) { if (!is_bpf_obj_drop_kfunc(meta->func_id)) { - verbose(env, "arg#%d expected for bpf_obj_drop()\n", i); + verbose(env, "%s expected for bpf_obj_drop()\n", + reg_arg_name(env, argno)); return -EINVAL; } } else if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC | MEM_PERCPU)) { if (!is_bpf_percpu_obj_drop_kfunc(meta->func_id)) { - verbose(env, "arg#%d expected for bpf_percpu_obj_drop()\n", i); + verbose(env, "%s expected for bpf_percpu_obj_drop()\n", + reg_arg_name(env, argno)); return -EINVAL; } } else { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } @@ -12271,10 +12184,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_DYNPTR: { enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR; - int clone_ref_obj_id = 0; - - if (reg->type == CONST_PTR_TO_DYNPTR) - dynptr_arg_type |= MEM_RDONLY; if (is_kfunc_arg_uninit(btf, &args[i])) dynptr_arg_type |= MEM_UNINIT; @@ -12288,11 +12197,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_file]) { dynptr_arg_type |= DYNPTR_TYPE_FILE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_file_discard]) { - dynptr_arg_type |= DYNPTR_TYPE_FILE; - meta->release_regno = regno; + dynptr_arg_type |= DYNPTR_TYPE_FILE | OBJ_RELEASE; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { - enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; + enum bpf_dynptr_type parent_type = 
meta->dynptr.type; if (parent_type == BPF_DYNPTR_TYPE_INVALID) { verifier_bug(env, "no dynptr type for parent of clone"); @@ -12300,29 +12208,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); - clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; - if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { - verifier_bug(env, "missing ref obj id for parent of clone"); - return -EFAULT; - } } - ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id); + ret = process_dynptr_func(env, reg, argno, insn_idx, dynptr_arg_type, + &meta->ref_obj, &meta->dynptr); if (ret < 0) return ret; - - if (!(dynptr_arg_type & MEM_UNINIT)) { - int id = dynptr_id(env, reg); - - if (id < 0) { - verifier_bug(env, "failed to obtain dynptr id"); - return id; - } - meta->initialized_dynptr.id = id; - meta->initialized_dynptr.type = dynptr_get_type(env, reg); - meta->initialized_dynptr.ref_obj_id = dynptr_ref_obj_id(env, reg); - } - break; } case KF_ARG_PTR_TO_ITER: @@ -12332,63 +12223,78 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } } - ret = process_iter_arg(env, regno, insn_idx, meta); + ret = process_iter_arg(env, reg, argno, insn_idx, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + verbose(env, "%s expected pointer to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && + !reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta); + ret = 
process_kf_arg_ptr_to_list_head(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_ROOT: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + verbose(env, "%s expected pointer to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && + !reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_rbtree_root(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_LIST_NODE: + if (is_kfunc_arg_nonown_allowed(btf, &args[i]) && + type_is_non_owning_ref(reg->type) && !reg_is_referenced(env, reg)) { + /* Allow bpf_list_front/back return value for + * __nonown_allowed list-node arguments. 
+ */ + goto check_ok; + } if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } - ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta); +check_ok: + ret = process_kf_arg_ptr_to_list_node(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_RB_NODE: if (is_bpf_rbtree_add_kfunc(meta->func_id)) { if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d expected pointer to allocated object\n", i); + verbose(env, "%s expected pointer to allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } - if (!reg->ref_obj_id) { + if (!reg_is_referenced(env, reg)) { verbose(env, "allocated object must be referenced\n"); return -EINVAL; } } else { - if (!type_is_non_owning_ref(reg->type) && !reg->ref_obj_id) { + if (!type_is_non_owning_ref(reg->type) && + !reg_is_referenced(env, reg)) { verbose(env, "%s can only take non-owning or refcounted bpf_rb_node pointer\n", func_name); return -EINVAL; } @@ -12398,7 +12304,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } } - ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); + ret = process_kf_arg_ptr_to_rbtree_node(env, reg, argno, meta); if (ret < 0) return ret; break; @@ -12413,38 +12319,44 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && !reg2btf_ids[base_type(reg->type)]) { - verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); + verbose(env, "%s is %s ", reg_arg_name(env, argno), + reg_type_str(env, reg->type)); verbose(env, "expected %s or socket\n", reg_type_str(env, 
base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); return -EINVAL; } - ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); + ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i, argno); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM: resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { - verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); + verbose(env, "%s reference type('%s %s') size cannot be determined: %ld\n", + reg_arg_name(env, argno), btf_type_str(ref_t), + ref_tname, PTR_ERR(resolve_ret)); return -EINVAL; } - ret = check_mem_reg(env, reg, regno, type_size); + ret = check_mem_reg(env, reg, argno, type_size); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_MEM_SIZE: { - struct bpf_reg_state *buff_reg = &regs[regno]; + struct bpf_reg_state *buff_reg = reg; const struct btf_param *buff_arg = &args[i]; - struct bpf_reg_state *size_reg = &regs[regno + 1]; + struct bpf_reg_state *size_reg = get_func_arg_reg(caller, regs, i + 1); const struct btf_param *size_arg = &args[i + 1]; + argno_t next_argno = argno_from_arg(i + 2); if (!bpf_register_is_null(buff_reg) || !is_kfunc_arg_nullable(meta->btf, buff_arg)) { - ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1); + ret = check_kfunc_mem_size_reg(env, buff_reg, size_reg, + argno, next_argno); if (ret < 0) { - verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); + verbose(env, "%s and ", reg_arg_name(env, argno)); + verbose(env, "%s memory, len pair leads to invalid memory access\n", + reg_arg_name(env, next_argno)); return ret; } } @@ -12455,7 +12367,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EFAULT; } if (!tnum_is_const(size_reg->var_off)) { - verbose(env, "R%d must be a known constant\n", regno + 1); + verbose(env, "%s 
must be a known constant\n", + reg_arg_name(env, next_argno)); return -EINVAL; } meta->arg_constant.found = true; @@ -12468,14 +12381,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } case KF_ARG_PTR_TO_CALLBACK: if (reg->type != PTR_TO_FUNC) { - verbose(env, "arg%d expected pointer to func\n", i); + verbose(env, "%s expected pointer to func\n", reg_arg_name(env, argno)); return -EINVAL; } meta->subprogno = reg->subprogno; break; case KF_ARG_PTR_TO_REFCOUNTED_KPTR: if (!type_is_ptr_alloc_obj(reg->type)) { - verbose(env, "arg#%d is neither owning or non-owning ref\n", i); + verbose(env, "%s is neither owning or non-owning ref\n", + reg_arg_name(env, argno)); return -EINVAL; } if (!type_is_non_owning_ref(reg->type)) @@ -12488,7 +12402,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ } if (rec->refcount_off < 0) { - verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i); + verbose(env, "%s doesn't point to a type with bpf_refcount field\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12497,46 +12412,51 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_CONST_STR: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a const string\n", i); + verbose(env, "%s doesn't point to a const string\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_reg_const_str(env, reg, regno); + ret = check_arg_const_str(env, reg, argno); if (ret) return ret; break; case KF_ARG_PTR_TO_WORKQUEUE: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_map_field_pointer(env, regno, BPF_WORKQUEUE, &meta->map); + ret = check_map_field_pointer(env, reg, argno, BPF_WORKQUEUE, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TIMER: if 
(reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = process_timer_kfunc(env, regno, meta); + ret = process_timer_kfunc(env, reg, argno, meta); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_TASK_WORK: if (reg->type != PTR_TO_MAP_VALUE) { - verbose(env, "arg#%d doesn't point to a map value\n", i); + verbose(env, "%s doesn't point to a map value\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = check_map_field_pointer(env, regno, BPF_TASK_WORK, &meta->map); + ret = check_map_field_pointer(env, reg, argno, BPF_TASK_WORK, &meta->map); if (ret < 0) return ret; break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { - verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); + verbose(env, "%s doesn't point to an irq flag on stack\n", + reg_arg_name(env, argno)); return -EINVAL; } - ret = process_irq_flag(env, regno, meta); + ret = process_irq_flag(env, reg, argno, meta); if (ret < 0) return ret; break; @@ -12545,7 +12465,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int flags = PROCESS_RES_LOCK; if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { - verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); + verbose(env, "%s doesn't point to map value or allocated object\n", + reg_arg_name(env, argno)); return -EINVAL; } @@ -12557,7 +12478,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) flags |= PROCESS_LOCK_IRQ; - ret = process_spin_lock(env, regno, flags); + ret = process_spin_lock(env, reg, argno, flags); if (ret < 0) return ret; break; @@ -12565,12 +12486,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, 
struct bpf_kfunc_call_ } } - if (is_kfunc_release(meta) && !meta->release_regno) { - verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", - func_name); - return -EINVAL; - } - return 0; } @@ -12597,6 +12512,10 @@ int bpf_fetch_kfunc_arg_meta(struct bpf_verifier_env *env, meta->kfunc_flags = *kfunc.flags; + /* Only support release referenced argument passed by register */ + if (is_kfunc_release(meta)) + meta->release_regno = BPF_REG_1; + return 0; } @@ -12926,7 +12845,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { - enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); + enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->dynptr.type); mark_reg_known_zero(env, regs, BPF_REG_0); @@ -12950,16 +12869,11 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } } - if (!meta->initialized_dynptr.id) { + if (!meta->dynptr.id) { verifier_bug(env, "no dynptr id"); return -EFAULT; } - regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; - - /* we don't need to set BPF_REG_0's ref obj id - * because packet slices are not refcounted (see - * dynptr_type_refcounted) - */ + regs[BPF_REG_0].parent_id = meta->dynptr.id; } else { return 0; } @@ -12968,14 +12882,12 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name); -static int process_bpf_exit_full(struct bpf_verifier_env *env, - bool *do_print_state, bool exception_exit); static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable; - u32 i, nargs, ptr_type_id, release_ref_obj_id; + enum bpf_prog_type prog_type = 
resolve_prog_type(env->prog); struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; const struct btf_type *t, *ptr_type; @@ -12983,7 +12895,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *insn_aux; int err, insn_idx = *insn_idx_p; const struct btf_param *args; + u32 i, nargs, ptr_type_id; struct btf *desc_btf; + int id; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) @@ -13050,6 +12964,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (err < 0) return err; + if ((is_bpf_obj_drop_kfunc(meta.func_id) || + is_bpf_percpu_obj_drop_kfunc(meta.func_id)) && (is_tracing_prog_type(prog_type) || + /* is_tracing_prog_type() for now doesn't cover non-iterator tracing progs. */ + (prog_type == BPF_PROG_TYPE_TRACING && env->prog->expected_attach_type != BPF_TRACE_ITER + && !env->prog->sleepable))) { + struct btf_struct_meta *struct_meta; + + struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); + if (struct_meta && btf_record_has_nmi_unsafe_fields(struct_meta->record)) { + verbose(env, "%s cannot be used in tracing programs on types with NMI unsafe fields\n", + func_name); + return -EINVAL; + } + } + if (is_bpf_rbtree_add_kfunc(meta.func_id)) { err = push_callback_call(env, insn, insn_idx, meta.subprogno, set_rbtree_add_callback_state); @@ -13094,22 +13023,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (rcu_lock) { env->cur_state->active_rcu_locks++; } else if (rcu_unlock) { - struct bpf_func_state *state; - struct bpf_reg_state *reg; - u32 clear_mask = (1 << STACK_SPILL) | (1 << STACK_ITER); - if (env->cur_state->active_rcu_locks == 0) { verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); return -EINVAL; } - if (--env->cur_state->active_rcu_locks == 0) { - bpf_for_each_reg_in_vstate_mask(env->cur_state, state, reg, clear_mask, 
({ - if (reg->type & MEM_RCU) { - reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); - reg->type |= PTR_UNTRUSTED; - } - })); - } + if (--env->cur_state->active_rcu_locks == 0) + invalidate_rcu_protected_refs(env); } else if (preempt_disable) { env->cur_state->active_preempt_locks++; } else if (preempt_enable) { @@ -13140,37 +13059,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ if (meta.release_regno) { - struct bpf_reg_state *reg = &regs[meta.release_regno]; - - if (meta.initialized_dynptr.ref_obj_id) { - err = unmark_stack_slots_dynptr(env, reg); - } else { - err = release_reference(env, reg->ref_obj_id); - if (err) - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - } + err = release_reg(env, &regs[meta.release_regno], false, !!meta.dynptr.id); if (err) return err; } if (is_bpf_list_push_kfunc(meta.func_id) || is_bpf_rbtree_add_kfunc(meta.func_id)) { - release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; + id = regs[BPF_REG_2].id; insn_aux->insert_off = regs[BPF_REG_2].var_off.value; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); - err = ref_convert_owning_non_owning(env, release_ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", - func_name, meta.func_id); - return err; - } - - err = release_reference(env, release_ref_obj_id); - if (err) { - verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, meta.func_id); - return err; - } + ref_convert_owning_non_owning(env, id); } if (meta.func_id == special_kfunc_list[KF_bpf_throw]) { @@ -13197,6 +13095,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, bpf_mark_reg_not_init(env, &regs[regno]); regs[regno].subreg_def = DEF_NOT_SUBREG; } + invalidate_outgoing_stack_args(env, cur_func(env)); /* Check return type */ t = 
btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); @@ -13254,8 +13153,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type |= MEM_RDONLY; /* Ensures we don't access the memory after a release_reference() */ - if (meta.ref_obj_id) - regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + if (meta.ref_obj.id) { + err = validate_ref_obj(env, &meta.ref_obj); + if (err) + return err; + regs[BPF_REG_0].parent_id = meta.ref_obj.id; + } if (is_kfunc_rcu_protected(&meta)) regs[BPF_REG_0].type |= MEM_RCU; @@ -13301,13 +13204,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); if (is_kfunc_acquire(&meta)) { - int id = acquire_reference(env, insn_idx); - + id = acquire_reference(env, insn_idx, 0); if (id < 0) return id; - if (is_kfunc_ret_null(&meta)) - regs[BPF_REG_0].id = id; - regs[BPF_REG_0].ref_obj_id = id; + regs[BPF_REG_0].id = id; } else if (is_rbtree_node_type(ptr_type) || is_list_node_type(ptr_type)) { ref_set_non_owning(env, &regs[BPF_REG_0]); } @@ -13329,8 +13229,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, clear_all_pkt_pointers(env); nargs = btf_type_vlen(meta.func_proto); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + struct bpf_func_state *caller = cur_func(env); + struct bpf_subprog_info *caller_info = &env->subprog_info[caller->subprogno]; + u16 out_stack_arg_cnt = nargs - MAX_BPF_FUNC_REG_ARGS; + u16 stack_arg_cnt = bpf_in_stack_arg_cnt(caller_info) + out_stack_arg_cnt; + + if (stack_arg_cnt > caller_info->stack_arg_cnt) + caller_info->stack_arg_cnt = stack_arg_cnt; + } + args = (const struct btf_param *)(meta.func_proto + 1); - for (i = 0; i < nargs; i++) { + for (i = 0; i < min_t(int, nargs, MAX_BPF_FUNC_REG_ARGS); i++) { u32 regno = i + 1; t = btf_type_skip_modifiers(desc_btf, args[i].type, NULL); @@ -13350,7 +13260,7 @@ static int check_kfunc_call(struct bpf_verifier_env 
*env, struct bpf_insn *insn, if (meta.func_id == special_kfunc_list[KF_bpf_session_cookie]) env->prog->call_session_cookie = true; - if (is_bpf_throw_kfunc(insn)) + if (bpf_is_throw_kfunc(insn)) return process_bpf_exit_full(env, NULL, true); return 0; @@ -13362,7 +13272,7 @@ static bool check_reg_sane_offset_scalar(struct bpf_verifier_env *env, { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; - s64 smin = reg->smin_value; + s64 smin = reg_smin(reg); if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", @@ -13391,7 +13301,7 @@ static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env, { bool known = tnum_is_const(reg->var_off); s64 val = reg->var_off.value; - s64 smin = reg->smin_value; + s64 smin = reg_smin(reg); if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "%s pointer offset %lld is not allowed\n", @@ -13433,7 +13343,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value; + ptr_limit = mask_to_left ? reg_smin(ptr_reg) : reg_umax(ptr_reg); break; default: return REASON_TYPE; @@ -13522,7 +13432,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn_aux_data *aux = commit_window ? 
cur_aux(env) : &info->aux; struct bpf_verifier_state *vstate = env->cur_state; bool off_is_imm = tnum_is_const(off_reg->var_off); - bool off_is_neg = off_reg->smin_value < 0; + bool off_is_neg = reg_smin(off_reg) < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); u32 alu_state, alu_limit; @@ -13541,7 +13451,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (!commit_window) { if (!tnum_is_const(off_reg->var_off) && - (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + (reg_smin(off_reg) < 0) != (reg_smax(off_reg) < 0)) return REASON_BOUNDS; info->mask_to_left = (opcode == BPF_ADD && off_is_neg) || @@ -13597,7 +13507,7 @@ do_sim: */ if (!ptr_is_dst_reg) { tmp = *dst_reg; - copy_register_state(dst_reg, ptr_reg); + *dst_reg = *ptr_reg; } err = sanitize_speculative_path(env, NULL, env->insn_idx + 1, env->insn_idx); if (err < 0) @@ -13691,7 +13601,7 @@ static int check_stack_access_for_ptr_arithmetic( static int sanitize_check_bounds(struct bpf_verifier_env *env, const struct bpf_insn *insn, - const struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg) { u32 dst = insn->dst_reg; @@ -13708,7 +13618,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst_reg, argno_from_reg(dst), 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -13735,10 +13645,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); - s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, - smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; - u64 umin_val = off_reg->umin_value, umax_val = 
off_reg->umax_value, - umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + s64 smin_val = reg_smin(off_reg), smax_val = reg_smax(off_reg); + u64 umin_val = reg_umin(off_reg), umax_val = reg_umax(off_reg); struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; @@ -13840,16 +13748,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * added into the variable offset, and we copy the fixed offset * from ptr_reg. */ - if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) || - check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) || - check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } + dst_reg->r64 = cnum64_add(ptr_reg->r64, off_reg->r64); dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { @@ -13881,24 +13780,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - /* A new variable offset is created. If the subtrahend is known - * nonnegative, then any reg->range we had before is still good. 
- */ - if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) || - check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) { - /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - if (umin_ptr < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value = umin_ptr - umax_val; - dst_reg->umax_value = umax_ptr - umin_val; - } + dst_reg->r64 = cnum64_add(ptr_reg->r64, cnum64_negate(off_reg->r64)); dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->raw = ptr_reg->raw; if (reg_is_pkt_pointer(ptr_reg)) { @@ -13955,227 +13837,123 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; - u32 umax_val = src_reg->u32_max_value; - bool min_overflow, max_overflow; - - if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || - check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; - } - - /* If either all additions overflow or no additions overflow, then - * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = - * dst_umax + src_umax. Otherwise (some additions overflow), set - * the output bounds to unbounded. 
- */ - min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); - max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); - - if (!min_overflow && max_overflow) { - *dst_umin = 0; - *dst_umax = U32_MAX; - } + dst_reg->r32 = cnum32_add(dst_reg->r32, src_reg->r32); } static void scalar_min_max_add(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; - bool min_overflow, max_overflow; - - if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || - check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; - } - - /* If either all additions overflow or no additions overflow, then - * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = - * dst_umax + src_umax. Otherwise (some additions overflow), set - * the output bounds to unbounded. 
- */ - min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); - max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); - - if (!min_overflow && max_overflow) { - *dst_umin = 0; - *dst_umax = U64_MAX; - } + dst_reg->r64 = cnum64_add(dst_reg->r64, src_reg->r64); } static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; - u32 umax_val = src_reg->u32_max_value; - bool min_underflow, max_underflow; - - if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || - check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { - /* Overflow possible, we know nothing */ - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; - } - - /* If either all subtractions underflow or no subtractions - * underflow, it is okay to set: dst_umin = dst_umin - src_umax, - * dst_umax = dst_umax - src_umin. Otherwise (some subtractions - * underflow), set the output bounds to unbounded. 
- */ - min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); - max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); - - if (min_underflow && !max_underflow) { - *dst_umin = 0; - *dst_umax = U32_MAX; - } + dst_reg->r32 = cnum32_add(dst_reg->r32, cnum32_negate(src_reg->r32)); } static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; - bool min_underflow, max_underflow; - - if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || - check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { - /* Overflow possible, we know nothing */ - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; - } - - /* If either all subtractions underflow or no subtractions - * underflow, it is okay to set: dst_umin = dst_umin - src_umax, - * dst_umax = dst_umax - src_umin. Otherwise (some subtractions - * underflow), set the output bounds to unbounded. 
- */ - min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); - max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); - - if (min_underflow && !max_underflow) { - *dst_umin = 0; - *dst_umax = U64_MAX; - } + dst_reg->r64 = cnum64_add(dst_reg->r64, cnum64_negate(src_reg->r64)); } static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + u32 umin = reg_u32_min(dst_reg); + u32 umax = reg_u32_max(dst_reg); s32 tmp_prod[4]; - if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) || - check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) { + if (check_mul_overflow(umax, reg_u32_max(src_reg), &umax) || + check_mul_overflow(umin, reg_u32_min(src_reg), &umin)) { /* Overflow possible, we know nothing */ - *dst_umin = 0; - *dst_umax = U32_MAX; + umin = 0; + umax = U32_MAX; } - if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) || - check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) || - check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) || - check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) { + if (check_mul_overflow(smin, reg_s32_min(src_reg), &tmp_prod[0]) || + check_mul_overflow(smin, reg_s32_max(src_reg), &tmp_prod[1]) || + check_mul_overflow(smax, reg_s32_min(src_reg), &tmp_prod[2]) || + check_mul_overflow(smax, reg_s32_max(src_reg), &tmp_prod[3])) { /* Overflow possible, we know nothing */ - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + smin = S32_MIN; + smax = S32_MAX; } else { - *dst_smin = min_array(tmp_prod, 4); - *dst_smax = max_array(tmp_prod, 4); + smin = min_array(tmp_prod, 4); + smax = max_array(tmp_prod, 4); } + + dst_reg->r32 = 
cnum32_intersect(cnum32_from_urange(umin, umax), + cnum32_from_srange(smin, smax)); } static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + u64 umin = reg_umin(dst_reg); + u64 umax = reg_umax(dst_reg); s64 tmp_prod[4]; - if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) || - check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) { + if (check_mul_overflow(umax, reg_umax(src_reg), &umax) || + check_mul_overflow(umin, reg_umin(src_reg), &umin)) { /* Overflow possible, we know nothing */ - *dst_umin = 0; - *dst_umax = U64_MAX; + umin = 0; + umax = U64_MAX; } - if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) || - check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) || - check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) || - check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) { + if (check_mul_overflow(smin, reg_smin(src_reg), &tmp_prod[0]) || + check_mul_overflow(smin, reg_smax(src_reg), &tmp_prod[1]) || + check_mul_overflow(smax, reg_smin(src_reg), &tmp_prod[2]) || + check_mul_overflow(smax, reg_smax(src_reg), &tmp_prod[3])) { /* Overflow possible, we know nothing */ - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + smin = S64_MIN; + smax = S64_MAX; } else { - *dst_smin = min_array(tmp_prod, 4); - *dst_smax = max_array(tmp_prod, 4); + smin = min_array(tmp_prod, 4); + smax = max_array(tmp_prod, 4); } + + dst_reg->r64 = cnum64_intersect(cnum64_from_urange(umin, umax), + cnum64_from_srange(smin, smax)); } static void scalar32_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 src_val = src_reg->u32_min_value; /* 
non-zero, const divisor */ + u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */ - *dst_umin = *dst_umin / src_val; - *dst_umax = *dst_umax / src_val; + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) / src_val, + reg_u32_max(dst_reg) / src_val); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_udiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */ - *dst_umin = div64_u64(*dst_umin, src_val); - *dst_umax = div64_u64(*dst_umax, src_val); + reg_set_urange64(dst_reg, div64_u64(reg_umin(dst_reg), src_val), + div64_u64(reg_umax(dst_reg), src_val)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 smin = reg_s32_min(dst_reg); + s32 smax = reg_s32_max(dst_reg); + s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */ s32 res1, res2; /* BPF div specification: S32_MIN / -1 = S32_MIN */ - if (*dst_smin == S32_MIN && src_val == -1) { + if (smin == S32_MIN && src_val == -1) { /* * If the dividend range contains more than just S32_MIN, * we cannot precisely track the result, so it becomes unbounded. @@ -14184,35 +13962,34 @@ static void scalar32_min_max_sdiv(struct bpf_reg_state *dst_reg, * = {S32_MIN} U [S32_MAX-9, S32_MAX] = [S32_MIN, S32_MAX] * Otherwise (if dividend is exactly S32_MIN), result remains S32_MIN. 
*/ - if (*dst_smax != S32_MIN) { - *dst_smin = S32_MIN; - *dst_smax = S32_MAX; + if (smax != S32_MIN) { + smin = S32_MIN; + smax = S32_MAX; } goto reset; } - res1 = *dst_smin / src_val; - res2 = *dst_smax / src_val; - *dst_smin = min(res1, res2); - *dst_smax = max(res1, res2); + res1 = smin / src_val; + res2 = smax / src_val; + smin = min(res1, res2); + smax = max(res1, res2); reset: + reg_set_srange32(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 smin = reg_smin(dst_reg); + s64 smax = reg_smax(dst_reg); + s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */ s64 res1, res2; /* BPF div specification: S64_MIN / -1 = S64_MIN */ - if (*dst_smin == S64_MIN && src_val == -1) { + if (smin == S64_MIN && src_val == -1) { /* * If the dividend range contains more than just S64_MIN, * we cannot precisely track the result, so it becomes unbounded. @@ -14221,79 +13998,66 @@ static void scalar_min_max_sdiv(struct bpf_reg_state *dst_reg, * = {S64_MIN} U [S64_MAX-9, S64_MAX] = [S64_MIN, S64_MAX] * Otherwise (if dividend is exactly S64_MIN), result remains S64_MIN. */ - if (*dst_smax != S64_MIN) { - *dst_smin = S64_MIN; - *dst_smax = S64_MAX; + if (smax != S64_MIN) { + smin = S64_MIN; + smax = S64_MAX; } goto reset; } - res1 = div64_s64(*dst_smin, src_val); - res2 = div64_s64(*dst_smax, src_val); - *dst_smin = min(res1, res2); - *dst_smax = max(res1, res2); + res1 = div64_s64(smin, src_val); + res2 = div64_s64(smax, src_val); + smin = min(res1, res2); + smax = max(res1, res2); reset: + reg_set_srange64(dst_reg, smin, smax); /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 *dst_umin = &dst_reg->u32_min_value; - u32 *dst_umax = &dst_reg->u32_max_value; - u32 src_val = src_reg->u32_min_value; /* non-zero, const divisor */ + u32 src_val = reg_u32_min(src_reg); /* non-zero, const divisor */ u32 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. */ - if (*dst_umax <= res_max) + if (reg_u32_max(dst_reg) <= res_max) return; - *dst_umin = 0; - *dst_umax = min(*dst_umax, res_max); + reg_set_urange32(dst_reg, 0, min(reg_u32_max(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_umod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 *dst_umin = &dst_reg->umin_value; - u64 *dst_umax = &dst_reg->umax_value; - u64 src_val = src_reg->umin_value; /* non-zero, const divisor */ + u64 src_val = reg_umin(src_reg); /* non-zero, const divisor */ u64 res_max = src_val - 1; /* * If dst_umax <= res_max, the result remains unchanged. * e.g., [2, 5] % 10 = [2, 5]. */ - if (*dst_umax <= res_max) + if (reg_umax(dst_reg) <= res_max) return; - *dst_umin = 0; - *dst_umax = min(*dst_umax, res_max); + reg_set_urange64(dst_reg, 0, min(reg_umax(dst_reg), res_max)); /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; reset_reg32_and_tnum(dst_reg); } static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 *dst_smin = &dst_reg->s32_min_value; - s32 *dst_smax = &dst_reg->s32_max_value; - s32 src_val = src_reg->s32_min_value; /* non-zero, const divisor */ + s32 src_val = reg_s32_min(src_reg); /* non-zero, const divisor */ /* * Safe absolute value calculation: @@ -14313,33 +14077,26 @@ static void scalar32_min_max_smod(struct bpf_reg_state *dst_reg, * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ - if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + if (reg_s32_min(dst_reg) >= -res_max_abs && reg_s32_max(dst_reg) <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ - if (*dst_smin >= 0) { - *dst_smin = 0; - *dst_smax = min(*dst_smax, res_max_abs); - } else if (*dst_smax <= 0) { - *dst_smax = 0; - *dst_smin = max(*dst_smin, -res_max_abs); + if (reg_s32_min(dst_reg) >= 0) { + reg_set_srange32(dst_reg, 0, min(reg_s32_max(dst_reg), res_max_abs)); + } else if (reg_s32_max(dst_reg) <= 0) { + reg_set_srange32(dst_reg, max(reg_s32_min(dst_reg), -res_max_abs), 0); } else { - *dst_smin = -res_max_abs; - *dst_smax = res_max_abs; + reg_set_srange32(dst_reg, -res_max_abs, res_max_abs); } /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; reset_reg64_and_tnum(dst_reg); } static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 *dst_smin = &dst_reg->smin_value; - s64 *dst_smax = &dst_reg->smax_value; - s64 src_val = src_reg->smin_value; /* non-zero, const divisor */ + s64 src_val = reg_smin(src_reg); /* non-zero, const divisor */ /* * Safe absolute value calculation: @@ -14359,24 +14116,19 @@ static void scalar_min_max_smod(struct bpf_reg_state *dst_reg, * If the dividend is already within the result range, * the result remains unchanged. e.g., [-2, 5] % 10 = [-2, 5]. */ - if (*dst_smin >= -res_max_abs && *dst_smax <= res_max_abs) + if (reg_smin(dst_reg) >= -res_max_abs && reg_smax(dst_reg) <= res_max_abs) return; /* General case: result has the same sign as the dividend. */ - if (*dst_smin >= 0) { - *dst_smin = 0; - *dst_smax = min(*dst_smax, res_max_abs); - } else if (*dst_smax <= 0) { - *dst_smax = 0; - *dst_smin = max(*dst_smin, -res_max_abs); + if (reg_smin(dst_reg) >= 0) { + reg_set_srange64(dst_reg, 0, min(reg_smax(dst_reg), res_max_abs)); + } else if (reg_smax(dst_reg) <= 0) { + reg_set_srange64(dst_reg, max(reg_smin(dst_reg), -res_max_abs), 0); } else { - *dst_smin = -res_max_abs; - *dst_smax = res_max_abs; + reg_set_srange64(dst_reg, -res_max_abs, res_max_abs); } /* Reset other ranges/tnum to unbounded/unknown. 
*/ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; reset_reg32_and_tnum(dst_reg); } @@ -14386,7 +14138,7 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - u32 umax_val = src_reg->u32_max_value; + u32 umax_val = reg_u32_max(src_reg); if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -14396,19 +14148,9 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. */ - dst_reg->u32_min_value = var32_off.value; - dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val); - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. - */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + reg_set_urange32(dst_reg, + var32_off.value, + min(reg_u32_max(dst_reg), umax_val)); } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, @@ -14416,7 +14158,7 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - u64 umax_val = src_reg->umax_value; + u64 umax_val = reg_umax(src_reg); if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); @@ -14426,19 +14168,10 @@ static void scalar_min_max_and(struct bpf_reg_state *dst_reg, /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. 
*/ - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = min(dst_reg->umax_value, umax_val); + reg_set_urange64(dst_reg, + dst_reg->var_off.value, + min(reg_umax(dst_reg), umax_val)); - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14449,7 +14182,7 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, bool src_known = tnum_subreg_is_const(src_reg->var_off); bool dst_known = tnum_subreg_is_const(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off); - u32 umin_val = src_reg->u32_min_value; + u32 umin_val = reg_u32_min(src_reg); if (src_known && dst_known) { __mark_reg32_known(dst_reg, var32_off.value); @@ -14459,19 +14192,9 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val); - dst_reg->u32_max_value = var32_off.value | var32_off.mask; - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. 
- */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + reg_set_urange32(dst_reg, + max(reg_u32_min(dst_reg), umin_val), + var32_off.value | var32_off.mask); } static void scalar_min_max_or(struct bpf_reg_state *dst_reg, @@ -14479,7 +14202,7 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, { bool src_known = tnum_is_const(src_reg->var_off); bool dst_known = tnum_is_const(dst_reg->var_off); - u64 umin_val = src_reg->umin_value; + u64 umin_val = reg_umin(src_reg); if (src_known && dst_known) { __mark_reg_known(dst_reg, dst_reg->var_off.value); @@ -14489,19 +14212,10 @@ static void scalar_min_max_or(struct bpf_reg_state *dst_reg, /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima */ - dst_reg->umin_value = max(dst_reg->umin_value, umin_val); - dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; + reg_set_urange64(dst_reg, + max(reg_umin(dst_reg), umin_val), + dst_reg->var_off.value | dst_reg->var_off.mask); - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. - */ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); } @@ -14519,19 +14233,7 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var32_off. 
*/ - dst_reg->u32_min_value = var32_off.value; - dst_reg->u32_max_value = var32_off.value | var32_off.mask; - - /* Safe to set s32 bounds by casting u32 result into s32 when u32 - * doesn't cross sign boundary. Otherwise set s32 bounds to unbounded. - */ - if ((s32)dst_reg->u32_min_value <= (s32)dst_reg->u32_max_value) { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; - } else { - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; - } + reg_set_urange32(dst_reg, var32_off.value, var32_off.value | var32_off.mask); } static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, @@ -14547,46 +14249,30 @@ static void scalar_min_max_xor(struct bpf_reg_state *dst_reg, } /* We get both minimum and maximum from the var_off. */ - dst_reg->umin_value = dst_reg->var_off.value; - dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask; - - /* Safe to set s64 bounds by casting u64 result into s64 when u64 - * doesn't cross sign boundary. Otherwise set s64 bounds to unbounded. 
- */ - if ((s64)dst_reg->umin_value <= (s64)dst_reg->umax_value) { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; - } else { - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - - __update_reg_bounds(dst_reg); + reg_set_urange64(dst_reg, + dst_reg->var_off.value, + dst_reg->var_off.value | dst_reg->var_off.mask); } static void __scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { - /* We lose all sign bit information (except what we can pick - * up from var_off) - */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; /* If we might shift our top bit out, then we know nothing */ - if (umax_val > 31 || dst_reg->u32_max_value > 1ULL << (31 - umax_val)) { - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - } else { - dst_reg->u32_min_value <<= umin_val; - dst_reg->u32_max_value <<= umax_val; - } + if (umax_val > 31 || reg_u32_max(dst_reg) > 1ULL << (31 - umax_val)) + reg_set_urange32(dst_reg, 0, U32_MAX); + else + /* We lose all sign bit information (except what we can pick + * up from var_off) + */ + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) << umin_val, + reg_u32_max(dst_reg) << umax_val); } static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u32 umax_val = src_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; + u32 umax_val = reg_u32_max(src_reg); + u32 umin_val = reg_u32_min(src_reg); /* u32 alu operation will zext upper bits */ struct tnum subreg = tnum_subreg(dst_reg->var_off); @@ -14603,34 +14289,34 @@ static void scalar32_min_max_lsh(struct bpf_reg_state *dst_reg, static void __scalar64_min_max_lsh(struct bpf_reg_state *dst_reg, u64 umin_val, u64 umax_val) { + struct cnum64 u, s; + /* Special case <<32 because it is a common compiler pattern to sign * extend subreg by doing <<32 s>>32. 
smin/smax assignments are correct * because s32 bounds don't flip sign when shifting to the left by * 32bits. */ - if (umin_val == 32 && umax_val == 32) { - dst_reg->smax_value = (s64)dst_reg->s32_max_value << 32; - dst_reg->smin_value = (s64)dst_reg->s32_min_value << 32; - } else { - dst_reg->smax_value = S64_MAX; - dst_reg->smin_value = S64_MIN; - } + if (umin_val == 32 && umax_val == 32) + s = cnum64_from_srange((s64)reg_s32_min(dst_reg) << 32, + (s64)reg_s32_max(dst_reg) << 32); + else + s = CNUM64_UNBOUNDED; /* If we might shift our top bit out, then we know nothing */ - if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - dst_reg->umin_value <<= umin_val; - dst_reg->umax_value <<= umax_val; - } + if (reg_umax(dst_reg) > 1ULL << (63 - umax_val)) + u = CNUM64_UNBOUNDED; + else + u = cnum64_from_urange(reg_umin(dst_reg) << umin_val, + reg_umax(dst_reg) << umax_val); + + dst_reg->r64 = cnum64_intersect(u, s); } static void scalar_min_max_lsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umax_val = src_reg->umax_value; - u64 umin_val = src_reg->umin_value; + u64 umax_val = reg_umax(src_reg); + u64 umin_val = reg_umin(src_reg); /* scalar64 calc uses 32bit unshifted bounds so must be called first */ __scalar64_min_max_lsh(dst_reg, umin_val, umax_val); @@ -14645,8 +14331,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { struct tnum subreg = tnum_subreg(dst_reg->var_off); - u32 umax_val = src_reg->u32_max_value; - u32 umin_val = src_reg->u32_min_value; + u32 umax_val = reg_u32_max(src_reg); + u32 umin_val = reg_u32_min(src_reg); /* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: @@ -14662,12 +14348,10 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. 
*/ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; dst_reg->var_off = tnum_rshift(subreg, umin_val); - dst_reg->u32_min_value >>= umax_val; - dst_reg->u32_max_value >>= umin_val; + reg_set_urange32(dst_reg, reg_u32_min(dst_reg) >> umax_val, + reg_u32_max(dst_reg) >> umin_val); __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); @@ -14676,8 +14360,8 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umax_val = src_reg->umax_value; - u64 umin_val = src_reg->umin_value; + u64 umax_val = reg_umax(src_reg); + u64 umin_val = reg_umin(src_reg); /* BPF_RSH is an unsigned shift. If the value in dst_reg might * be negative, then either: @@ -14693,11 +14377,9 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * and rely on inferring new ones from the unsigned bounds and * var_off of the result. */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); - dst_reg->umin_value >>= umax_val; - dst_reg->umax_value >>= umin_val; + reg_set_urange64(dst_reg, reg_umin(dst_reg) >> umax_val, + reg_umax(dst_reg) >> umin_val); /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in. Take easy way out and mark unbounded @@ -14710,22 +14392,19 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umin_val = src_reg->u32_min_value; + u64 umin_val = reg_u32_min(src_reg); /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. + * Blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. 
*/ - dst_reg->s32_min_value = (u32)(((s32)dst_reg->s32_min_value) >> umin_val); - dst_reg->s32_max_value = (u32)(((s32)dst_reg->s32_max_value) >> umin_val); + reg_set_srange32(dst_reg, + (u32)(((s32)reg_s32_min(dst_reg)) >> umin_val), + (u32)(((s32)reg_s32_max(dst_reg)) >> umin_val)); dst_reg->var_off = tnum_arshift(tnum_subreg(dst_reg->var_off), umin_val, 32); - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - __mark_reg64_unbounded(dst_reg); __update_reg32_bounds(dst_reg); } @@ -14733,22 +14412,16 @@ static void scalar32_min_max_arsh(struct bpf_reg_state *dst_reg, static void scalar_min_max_arsh(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - u64 umin_val = src_reg->umin_value; + u64 umin_val = reg_umin(src_reg); /* Upon reaching here, src_known is true and umax_val is equal * to umin_val. */ - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; + reg_set_srange64(dst_reg, reg_smin(dst_reg) >> umin_val, + reg_smax(dst_reg) >> umin_val); dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, 64); - /* blow away the dst_reg umin_value/umax_value and rely on - * dst_reg var_off to refine the result. - */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - /* Its not easy to operate on alu32 bounds here because it depends * on bits being shifted in from upper 32-bits. Take easy way out * and mark unbounded so we can recalculate later from tnum. 
@@ -14814,13 +14487,13 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, if (insn_bitness == 32) { if (tnum_subreg_is_const(src_reg->var_off) - && src_reg->s32_min_value == src_reg->s32_max_value - && src_reg->u32_min_value == src_reg->u32_max_value) + && reg_s32_min(src_reg) == reg_s32_max(src_reg) + && reg_u32_min(src_reg) == reg_u32_max(src_reg)) src_is_const = true; } else { if (tnum_is_const(src_reg->var_off) - && src_reg->smin_value == src_reg->smax_value - && src_reg->umin_value == src_reg->umax_value) + && reg_smin(src_reg) == reg_smax(src_reg) + && reg_umin(src_reg) == reg_umax(src_reg)) src_is_const = true; } @@ -14850,7 +14523,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, case BPF_LSH: case BPF_RSH: case BPF_ARSH: - return (src_is_const && src_reg->umax_value < insn_bitness); + return (src_is_const && reg_umax(src_reg) < insn_bitness); default: return false; } @@ -14863,9 +14536,9 @@ static int maybe_fork_scalars(struct bpf_verifier_env *env, struct bpf_insn *ins struct bpf_reg_state *regs; bool alu32; - if (dst_reg->smin_value == -1 && dst_reg->smax_value == 0) + if (reg_smin(dst_reg) == -1 && reg_smax(dst_reg) == 0) alu32 = false; - else if (dst_reg->s32_min_value == -1 && dst_reg->s32_max_value == 0) + else if (reg_s32_min(dst_reg) == -1 && reg_s32_max(dst_reg) == 0) alu32 = true; else return 0; @@ -14949,7 +14622,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; case BPF_DIV: /* BPF div specification: x / 0 = 0 */ - if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) { + if ((alu32 && reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) { ___mark_reg_known(dst_reg, 0); break; } @@ -14966,7 +14639,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; case BPF_MOD: /* BPF mod specification: x % 0 = x */ - if ((alu32 && src_reg.u32_min_value == 0) || (!alu32 && src_reg.umin_value == 0)) + if ((alu32 && 
reg_u32_min(&src_reg) == 0) || (!alu32 && reg_umin(&src_reg) == 0)) break; if (alu32) if (off == 1) @@ -15154,7 +14827,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * umax_value before the ALU operation. After adjust_scalar_min_max_vals(), * alu32 ops will have zero-extended the result, making umax_value <= U32_MAX. */ - u64 dst_umax = dst_reg->umax_value; + u64 dst_umax = reg_umax(dst_reg); err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) @@ -15284,7 +14957,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * copy register state to dest reg */ assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -15296,10 +14969,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { bool no_sext; - no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; if (!no_sext) clear_scalar_id(dst_reg); coerce_reg_to_size_sx(dst_reg, insn->off >> 3); @@ -15321,7 +14994,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (is_src_reg_u32) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; /* Make sure ID is cleared if src_reg is not in u32 * range otherwise dst_reg min/max could be incorrectly * propagated into src_reg by sync_linked_regs() @@ -15331,11 +15004,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ - bool no_sext = src_reg->umax_value < (1ULL << (insn->off - 1)); + bool no_sext = reg_umax(src_reg) < (1ULL << (insn->off - 
1)); if (no_sext) assign_scalar_id_before_mov(env, src_reg); - copy_register_state(dst_reg, src_reg); + *dst_reg = *src_reg; if (!no_sext) clear_scalar_id(dst_reg); dst_reg->subreg_def = env->insn_idx + 1; @@ -15413,17 +15086,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *reg; int new_range; - if (dst_reg->umax_value == 0 && range_right_open) + if (reg_umax(dst_reg) == 0 && range_right_open) /* This doesn't give us any range */ return; - if (dst_reg->umax_value > MAX_PACKET_OFF) + if (reg_umax(dst_reg) > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. */ return; - new_range = dst_reg->umax_value; + new_range = reg_umax(dst_reg); if (range_right_open) new_range++; @@ -15472,7 +15145,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, /* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. - * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16. + * reg_umax(dst_reg) is known < MAX_PACKET_OFF, therefore it fits in a u16. */ bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) @@ -15528,14 +15201,14 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s { struct tnum t1 = is_jmp32 ? tnum_subreg(reg1->var_off) : reg1->var_off; struct tnum t2 = is_jmp32 ? tnum_subreg(reg2->var_off) : reg2->var_off; - u64 umin1 = is_jmp32 ? (u64)reg1->u32_min_value : reg1->umin_value; - u64 umax1 = is_jmp32 ? (u64)reg1->u32_max_value : reg1->umax_value; - s64 smin1 = is_jmp32 ? (s64)reg1->s32_min_value : reg1->smin_value; - s64 smax1 = is_jmp32 ? (s64)reg1->s32_max_value : reg1->smax_value; - u64 umin2 = is_jmp32 ? (u64)reg2->u32_min_value : reg2->umin_value; - u64 umax2 = is_jmp32 ? 
(u64)reg2->u32_max_value : reg2->umax_value; - s64 smin2 = is_jmp32 ? (s64)reg2->s32_min_value : reg2->smin_value; - s64 smax2 = is_jmp32 ? (s64)reg2->s32_max_value : reg2->smax_value; + u64 umin1 = is_jmp32 ? (u64)reg_u32_min(reg1) : reg_umin(reg1); + u64 umax1 = is_jmp32 ? (u64)reg_u32_max(reg1) : reg_umax(reg1); + s64 smin1 = is_jmp32 ? (s64)reg_s32_min(reg1) : reg_smin(reg1); + s64 smax1 = is_jmp32 ? (s64)reg_s32_max(reg1) : reg_smax(reg1); + u64 umin2 = is_jmp32 ? (u64)reg_u32_min(reg2) : reg_umin(reg2); + u64 umax2 = is_jmp32 ? (u64)reg_u32_max(reg2) : reg_umax(reg2); + s64 smin2 = is_jmp32 ? (s64)reg_s32_min(reg2) : reg_smin(reg2); + s64 smax2 = is_jmp32 ? (s64)reg_s32_max(reg2) : reg_smax(reg2); if (reg1 == reg2) { switch (opcode) { @@ -15580,11 +15253,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ - if (reg1->u32_min_value > reg2->u32_max_value || - reg1->u32_max_value < reg2->u32_min_value) + if (reg_u32_min(reg1) > reg_u32_max(reg2) || + reg_u32_max(reg1) < reg_u32_min(reg2)) return 0; - if (reg1->s32_min_value > reg2->s32_max_value || - reg1->s32_max_value < reg2->s32_min_value) + if (reg_s32_min(reg1) > reg_s32_max(reg2) || + reg_s32_max(reg1) < reg_s32_min(reg2)) return 0; } break; @@ -15606,11 +15279,11 @@ static int is_scalar_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_s * utilize 32-bit subrange knowledge to eliminate * branches that can't be taken a priori */ - if (reg1->u32_min_value > reg2->u32_max_value || - reg1->u32_max_value < reg2->u32_min_value) + if (reg_u32_min(reg1) > reg_u32_max(reg2) || + reg_u32_max(reg1) < reg_u32_min(reg2)) return 1; - if (reg1->s32_min_value > reg2->s32_max_value || - reg1->s32_max_value < reg2->s32_min_value) + if (reg_s32_min(reg1) > reg_s32_max(reg2) || + reg_s32_max(reg1) < reg_s32_min(reg2)) return 1; } break; @@ -15765,7 +15438,7 @@ static int 
is_branch_taken(struct bpf_verifier_env *env, struct bpf_reg_state *r if (!is_reg_const(reg2, is_jmp32)) return -1; - if (!reg_not_null(reg1)) + if (!reg_not_null(env, reg1)) return -1; /* If pointer is valid tests against zero will fail so we can @@ -15837,27 +15510,15 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state switch (opcode) { case BPF_JEQ: if (is_jmp32) { - reg1->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); - reg1->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); - reg2->u32_min_value = reg1->u32_min_value; - reg2->u32_max_value = reg1->u32_max_value; - reg2->s32_min_value = reg1->s32_min_value; - reg2->s32_max_value = reg1->s32_max_value; + reg1->r32 = cnum32_intersect(reg1->r32, reg2->r32); + reg2->r32 = reg1->r32; t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); } else { - reg1->umin_value = max(reg1->umin_value, reg2->umin_value); - reg1->umax_value = min(reg1->umax_value, reg2->umax_value); - reg1->smin_value = max(reg1->smin_value, reg2->smin_value); - reg1->smax_value = min(reg1->smax_value, reg2->smax_value); - reg2->umin_value = reg1->umin_value; - reg2->umax_value = reg1->umax_value; - reg2->smin_value = reg1->smin_value; - reg2->smax_value = reg1->smax_value; + reg1->r64 = cnum64_intersect(reg1->r64, reg2->r64); + reg2->r64 = reg1->r64; reg1->var_off = tnum_intersect(reg1->var_off, reg2->var_off); reg2->var_off = reg1->var_off; @@ -15874,32 +15535,11 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state */ val = reg_const_value(reg2, is_jmp32); if (is_jmp32) { - /* u32_min_value is not equal to 0xffffffff at this point, - * because otherwise u32_max_value is 0xffffffff as well, - * 
in such a case both reg1 and reg2 would be constants, - * jump would be predicted and regs_refine_cond_op() - * wouldn't be called. - * - * Same reasoning works for all {u,s}{min,max}{32,64} cases - * below. - */ - if (reg1->u32_min_value == (u32)val) - reg1->u32_min_value++; - if (reg1->u32_max_value == (u32)val) - reg1->u32_max_value--; - if (reg1->s32_min_value == (s32)val) - reg1->s32_min_value++; - if (reg1->s32_max_value == (s32)val) - reg1->s32_max_value--; + /* Complement of the range [val, val] as cnum32. */ + cnum32_intersect_with(&reg1->r32, (struct cnum32){ val + 1, U32_MAX - 1 }); } else { - if (reg1->umin_value == (u64)val) - reg1->umin_value++; - if (reg1->umax_value == (u64)val) - reg1->umax_value--; - if (reg1->smin_value == (s64)val) - reg1->smin_value++; - if (reg1->smax_value == (s64)val) - reg1->smax_value--; + /* Complement of the range [val, val] as cnum64. */ + cnum64_intersect_with(&reg1->r64, (struct cnum64){ val + 1, U64_MAX - 1 }); } break; case BPF_JSET: @@ -15946,38 +15586,38 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state break; case BPF_JLE: if (is_jmp32) { - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value); - reg2->u32_min_value = max(reg1->u32_min_value, reg2->u32_min_value); + cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2)); + cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1), U32_MAX); } else { - reg1->umax_value = min(reg1->umax_value, reg2->umax_value); - reg2->umin_value = max(reg1->umin_value, reg2->umin_value); + cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2)); + cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1), U64_MAX); } break; case BPF_JLT: if (is_jmp32) { - reg1->u32_max_value = min(reg1->u32_max_value, reg2->u32_max_value - 1); - reg2->u32_min_value = max(reg1->u32_min_value + 1, reg2->u32_min_value); + cnum32_intersect_with_urange(&reg1->r32, 0, reg_u32_max(reg2) - 1); + cnum32_intersect_with_urange(&reg2->r32, reg_u32_min(reg1) + 1, U32_MAX); }
else { - reg1->umax_value = min(reg1->umax_value, reg2->umax_value - 1); - reg2->umin_value = max(reg1->umin_value + 1, reg2->umin_value); + cnum64_intersect_with_urange(&reg1->r64, 0, reg_umax(reg2) - 1); + cnum64_intersect_with_urange(&reg2->r64, reg_umin(reg1) + 1, U64_MAX); } break; case BPF_JSLE: if (is_jmp32) { - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value); - reg2->s32_min_value = max(reg1->s32_min_value, reg2->s32_min_value); + cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2)); + cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1), S32_MAX); } else { - reg1->smax_value = min(reg1->smax_value, reg2->smax_value); - reg2->smin_value = max(reg1->smin_value, reg2->smin_value); + cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2)); + cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1), S64_MAX); } break; case BPF_JSLT: if (is_jmp32) { - reg1->s32_max_value = min(reg1->s32_max_value, reg2->s32_max_value - 1); - reg2->s32_min_value = max(reg1->s32_min_value + 1, reg2->s32_min_value); + cnum32_intersect_with_srange(&reg1->r32, S32_MIN, reg_s32_max(reg2) - 1); + cnum32_intersect_with_srange(&reg2->r32, reg_s32_min(reg1) + 1, S32_MAX); } else { - reg1->smax_value = min(reg1->smax_value, reg2->smax_value - 1); - reg2->smin_value = max(reg1->smin_value + 1, reg2->smin_value); + cnum64_intersect_with_srange(&reg1->r64, S64_MIN, reg_smax(reg2) - 1); + cnum64_intersect_with_srange(&reg2->r64, reg_smin(reg1) + 1, S64_MAX); } break; default: @@ -16015,7 +15655,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0))) return; if (is_null) { - /* We don't need id and ref_obj_id from this point + /* We don't need id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect.
*/ @@ -16027,15 +15667,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, mark_ptr_not_null_reg(reg); - if (!reg_may_point_to_spin_lock(reg)) { - /* For not-NULL ptr, reg->ref_obj_id will be reset - * in release_reference(). - * - * reg->id is still used by spin_lock ptr. Other - * than spin_lock ptr type, reg->id can be reset. - */ - reg->id = 0; - } + /* + * reg->id is preserved for object relationship tracking + * and spin_lock lock state tracking + */ } } @@ -16047,10 +15682,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, { struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; - u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - if (ref_obj_id && ref_obj_id == id && is_null) + if (is_null && find_reference_state(vstate, id)) /* regs[regno] is in the " == NULL" branch. * No one could have freed the reference state before * doing the NULL check. @@ -16248,7 +15882,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s reg->delta == known_reg->delta) { s32 saved_subreg_def = reg->subreg_def; - copy_register_state(reg, known_reg); + *reg = *known_reg; reg->subreg_def = saved_subreg_def; } else { s32 saved_subreg_def = reg->subreg_def; @@ -16259,7 +15893,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta); /* reg = known_reg; reg += delta */ - copy_register_state(reg, known_reg); + *reg = *known_reg; /* * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. 
@@ -16356,16 +15990,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (insn_flags) { - err = bpf_push_jmp_history(env, this_branch, insn_flags, 0); + err = bpf_push_jmp_history(env, this_branch, insn_flags, 0, 0, 0); if (err) return err; } is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; - copy_register_state(&env->false_reg1, dst_reg); - copy_register_state(&env->false_reg2, src_reg); - copy_register_state(&env->true_reg1, dst_reg); - copy_register_state(&env->true_reg2, src_reg); + env->false_reg1 = *dst_reg; + env->false_reg2 = *src_reg; + env->true_reg1 = *dst_reg; + env->true_reg2 = *src_reg; pred = is_branch_taken(env, dst_reg, src_reg, opcode, is_jmp32); if (pred >= 0) { /* If we get here with a dst_reg pointer type it is because @@ -16420,7 +16054,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(env, this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = bpf_push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = bpf_push_jmp_history(env, this_branch, 0, 0, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -16434,11 +16068,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; - copy_register_state(dst_reg, &env->false_reg1); - copy_register_state(src_reg, &env->false_reg2); - copy_register_state(&other_branch_regs[insn->dst_reg], &env->true_reg1); + *dst_reg = env->false_reg1; + *src_reg = env->false_reg2; + other_branch_regs[insn->dst_reg] = env->true_reg1; if (BPF_SRC(insn->code) == BPF_X) - copy_register_state(&other_branch_regs[insn->src_reg], &env->true_reg2); + other_branch_regs[insn->src_reg] = env->true_reg2; if (BPF_SRC(insn->code) == BPF_X && src_reg->type == SCALAR_VALUE && src_reg->id && @@ -16773,6 +16407,9 @@ static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_ case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case 
BPF_TRACE_FSESSION: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: *range = retval_range(0, 0); break; case BPF_TRACE_RAW_TP: @@ -16889,8 +16526,8 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, prog->aux->attach_func_proto->type, NULL); - if (ret_type && ret_type == reg_type && reg->ref_obj_id) - return __check_ptr_off_reg(env, reg, regno, false); + if (ret_type && ret_type == reg_type && reg_is_referenced(env, reg)) + return __check_ptr_off_reg(env, reg, argno_from_reg(regno), false); } /* eBPF calling convention is such that R0 is used @@ -16962,6 +16599,10 @@ static int check_global_subprog_return_code(struct bpf_verifier_env *env) if (err) return err; + /* Pointers to arena are safe to pass between subprograms. */ + if (is_arena_reg(env, BPF_REG_0)) + return 0; + if (is_pointer_value(env, BPF_REG_0)) { verbose(env, "R%d leaks addr as return value\n", BPF_REG_0); return -EACCES; @@ -17478,16 +17119,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env, u32 *pmin_index, u32 *pmax_index) { struct bpf_reg_state *reg = reg_state(env, regno); - u64 min_index = reg->umin_value; - u64 max_index = reg->umax_value; + u64 min_index = reg_umin(reg); + u64 max_index = reg_umax(reg); const u32 size = 8; if (min_index > (u64) U32_MAX * size) { - verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value); + verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg_umin(reg)); return -ERANGE; } if (max_index > (u64) U32_MAX * size) { - verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value); + verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg_umax(reg)); return -ERANGE; } @@ -17586,6 +17227,14 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) return check_store_reg(env, insn, false); case BPF_ST: { + /* 
Handle stack arg write (store immediate) */ + if (is_stack_arg_st(insn)) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + + return check_stack_arg_write(env, state, insn->off, NULL); + } + enum bpf_reg_type dst_reg_type; err = check_reg_arg(env, insn->dst_reg, SRC_OP); @@ -17594,7 +17243,7 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) dst_reg_type = cur_regs(env)[insn->dst_reg].type; - err = check_mem_access(env, env->insn_idx, insn->dst_reg, + err = check_mem_access(env, env->insn_idx, cur_regs(env) + insn->dst_reg, argno_from_reg(insn->dst_reg), insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, false, false); if (err) @@ -17620,6 +17269,8 @@ static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) } } mark_reg_scratched(env, BPF_REG_0); + if (bpf_in_stack_arg_cnt(&env->subprog_info[cur_func(env)->subprogno])) + cur_func(env)->no_stack_arg_load = true; if (insn->src_reg == BPF_PSEUDO_CALL) return check_func_call(env, insn, &env->insn_idx); if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) @@ -17717,7 +17368,7 @@ static int do_check(struct bpf_verifier_env *env) } if (bpf_is_jmp_point(env, env->insn_idx)) { - err = bpf_push_jmp_history(env, state, 0, 0); + err = bpf_push_jmp_history(env, state, 0, 0, 0, 0); if (err) return err; } @@ -18102,11 +17753,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, if (prog->sleepable) switch (map->map_type) { case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_RHASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PERCPU_HASH: case BPF_MAP_TYPE_PERCPU_ARRAY: case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_LPM_TRIE: case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: case BPF_MAP_TYPE_RINGBUF: @@ -18424,11 +18077,12 @@ static int check_and_resolve_insns(struct bpf_verifier_env *env) return err; for (i = 0; i < insn_cnt; i++, insn++) { - if 
(insn->dst_reg >= MAX_BPF_REG) { + if (insn->dst_reg >= MAX_BPF_REG && + !is_stack_arg_st(insn) && !is_stack_arg_stx(insn)) { verbose(env, "R%d is invalid\n", insn->dst_reg); return -EINVAL; } - if (insn->src_reg >= MAX_BPF_REG) { + if (insn->src_reg >= MAX_BPF_REG && !is_stack_arg_ldx(insn)) { verbose(env, "R%d is invalid\n", insn->src_reg); return -EINVAL; } @@ -18735,7 +18389,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) goto out; } } - for (i = BPF_REG_1; i <= sub->arg_cnt; i++) { + for (i = BPF_REG_1; i <= min_t(u32, sub->arg_cnt, MAX_BPF_FUNC_REG_ARGS); i++) { arg = &sub->args[i - BPF_REG_1]; reg = &regs[i]; @@ -18745,9 +18399,9 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) } else if (arg->arg_type == ARG_ANYTHING) { reg->type = SCALAR_VALUE; mark_reg_unknown(env, regs, i); - } else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) { + } else if (arg->arg_type == ARG_PTR_TO_DYNPTR) { /* assume unspecial LOCAL dynptr type */ - __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); + __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen, 0); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; reg->type |= arg->arg_type & @@ -18773,11 +18427,17 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_unknown(env, regs, i); } else { verifier_bug(env, "unhandled arg#%d type %d", - i - BPF_REG_1, arg->arg_type); + i - BPF_REG_1 + 1, arg->arg_type); ret = -EFAULT; goto out; } } + if (env->prog->type == BPF_PROG_TYPE_EXT && sub->arg_cnt > MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "freplace programs with >%d args not supported yet\n", + MAX_BPF_FUNC_REG_ARGS); + ret = -EINVAL; + goto out; + } } else { /* if main BPF program has associated BTF info, validate that * it's matching expected signature, and otherwise mark BTF @@ -18785,8 +18445,11 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) */ if
(env->prog->aux->func_info_aux) { ret = btf_prepare_func_args(env, 0); - if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) + if (ret || sub->arg_cnt != 1 || sub->args[0].arg_type != ARG_PTR_TO_CTX) { env->prog->aux->func_info_aux[0].unreliable = true; + sub->arg_cnt = 1; + sub->stack_arg_cnt = 0; + } } /* 1st arg to a function */ @@ -18796,9 +18459,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* Acquire references for struct_ops program arguments tagged with "__ref" */ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { - for (i = 0; i < aux->ctx_arg_info_size; i++) - aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? - acquire_reference(env, 0) : 0; + for (i = 0; i < aux->ctx_arg_info_size; i++) { + ret = aux->ctx_arg_info[i].refcounted ? acquire_reference(env, 0, 0) : 0; + if (ret < 0) + goto out; + + aux->ctx_arg_info[i].ref_id = ret; + } } ret = do_check(env); @@ -18834,6 +18501,7 @@ static int do_check_subprogs(struct bpf_verifier_env *env) struct bpf_prog_aux *aux = env->prog->aux; struct bpf_func_info_aux *sub_aux; int i, ret, new_cnt; + u32 insn_processed; if (!aux->func_info) return 0; @@ -18848,6 +18516,8 @@ again: if (!bpf_subprog_is_global(env, i)) continue; + insn_processed = env->insn_processed; + sub_aux = subprog_aux(env, i); if (!sub_aux->called || sub_aux->verified) continue; @@ -18855,6 +18525,7 @@ again: env->insn_idx = env->subprog_info[i].start; WARN_ON_ONCE(env->insn_idx == 0); ret = do_check_common(env, i); + env->subprog_info[i].insn_processed = env->insn_processed - insn_processed; if (ret) { return ret; } else if (env->log.level & BPF_LOG_LEVEL) { @@ -18881,10 +18552,12 @@ again: static int do_check_main(struct bpf_verifier_env *env) { + u32 insn_processed = env->insn_processed; int ret; env->insn_idx = 0; ret = do_check_common(env, 0); + env->subprog_info[0].insn_processed = env->insn_processed - insn_processed; if (!ret) env->prog->aux->stack_depth = 
env->subprog_info[0].stack_depth; return ret; @@ -18893,19 +18566,20 @@ static int do_check_main(struct bpf_verifier_env *env) static void print_verification_stats(struct bpf_verifier_env *env) { - int i; + /* Skip over hidden subprogs which are not verified. */ + int i, subprog_cnt = env->subprog_cnt - env->hidden_subprog_cnt; if (env->log.level & BPF_LOG_STATS) { verbose(env, "verification time %lld usec\n", div_u64(env->verification_time, 1000)); - verbose(env, "stack depth "); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } + verbose(env, "stack depth %d", env->subprog_info[0].stack_depth); + for (i = 1; i < subprog_cnt; i++) + verbose(env, "+%d", env->subprog_info[i].stack_depth); + verbose(env, " max %d\n", env->max_stack_depth); + verbose(env, "insns processed %d", env->subprog_info[0].insn_processed); + for (i = 1; i < subprog_cnt; i++) + if (bpf_subprog_is_global(env, i)) + verbose(env, "+%d", env->subprog_info[i].insn_processed); verbose(env, "\n"); } verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " @@ -19127,6 +18801,60 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name) #endif /* CONFIG_FUNCTION_ERROR_INJECTION */ +static bool is_tracing_multi_id(const struct bpf_prog *prog, u32 btf_id) +{ + return is_tracing_multi(prog->expected_attach_type) && bpf_multi_func_btf_id[0] == btf_id; +} + +static int btf_id_allow_sleepable(u32 btf_id, unsigned long addr, const struct bpf_prog *prog, + const struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + t = btf_type_by_id(btf, btf_id); + if (!t) + return -EINVAL; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return -EINVAL; + + /* + * *.multi sleepable programs will pass initial sleepable check, + * the actual attached btf ids are checked later 
during the link + * attachment. + */ + if (is_tracing_multi_id(prog, btf_id)) + return 0; + if (!check_attach_sleepable(btf_id, addr, tname)) + return 0; + /* + * fentry/fexit/fmod_ret progs can also be sleepable if they are + * in the fmodret id set with the KF_SLEEPABLE flag. + */ + else { + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, prog); + + if (flags && (*flags & KF_SLEEPABLE)) + return 0; + } + break; + case BPF_PROG_TYPE_LSM: + /* + * LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. + */ + if (bpf_lsm_is_sleepable_hook(btf_id)) + return 0; + break; + default: + break; + } + return -EINVAL; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, @@ -19249,7 +18977,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, prog_extension && (tgt_prog->expected_attach_type == BPF_TRACE_FENTRY || tgt_prog->expected_attach_type == BPF_TRACE_FEXIT || - tgt_prog->expected_attach_type == BPF_TRACE_FSESSION)) { + tgt_prog->expected_attach_type == BPF_TRACE_FENTRY_MULTI || + tgt_prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION || + tgt_prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { /* Program extensions can extend all program types * except fentry/fexit. The reason is the following. 
* The fentry/fexit programs are used for performance @@ -19299,6 +19030,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, btp = bpf_get_raw_tracepoint(tname); if (!btp) return -EINVAL; + if (prog->sleepable && !tracepoint_is_faultable(btp->tp)) { + bpf_log(log, "Sleepable program cannot attach to non-faultable tracepoint %s\n", + tname); + bpf_put_raw_tracepoint(btp); + return -EINVAL; + } fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL, trace_symbol); bpf_put_raw_tracepoint(btp); @@ -19349,7 +19086,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: case BPF_TRACE_FSESSION: - if (prog->expected_attach_type == BPF_TRACE_FSESSION && + case BPF_TRACE_FSESSION_MULTI: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: + if ((prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) && !bpf_jit_supports_fsession()) { bpf_log(log, "JIT does not support fsession\n"); return -EOPNOTSUPP; @@ -19378,7 +19119,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (ret < 0) return ret; - if (tgt_prog) { + /* + * *.multi programs don't need an address during program + * verification, we just take the module ref if needed. + */ + if (is_tracing_multi_id(prog, btf_id)) { + if (btf_is_module(btf)) { + mod = btf_try_get_module(btf); + if (!mod) + return -ENOENT; + } + addr = 0; + } else if (tgt_prog) { if (subprog == 0) addr = (long) tgt_prog->bpf_func; else @@ -19403,32 +19155,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, } if (prog->sleepable) { - ret = -EINVAL; - switch (prog->type) { - case BPF_PROG_TYPE_TRACING: - if (!check_attach_sleepable(btf_id, addr, tname)) - ret = 0; - /* fentry/fexit/fmod_ret progs can also be sleepable if they are - * in the fmodret id set with the KF_SLEEPABLE flag. 
- */ - else { - u32 *flags = btf_kfunc_is_modify_return(btf, btf_id, - prog); - - if (flags && (*flags & KF_SLEEPABLE)) - ret = 0; - } - break; - case BPF_PROG_TYPE_LSM: - /* LSM progs check that they are attached to bpf_lsm_*() funcs. - * Only some of them are sleepable. - */ - if (bpf_lsm_is_sleepable_hook(btf_id)) - ret = 0; - break; - default: - break; - } + ret = btf_id_allow_sleepable(btf_id, addr, prog, btf); if (ret) { module_put(mod); bpf_log(log, "%s is not sleepable\n", tname); @@ -19515,14 +19242,22 @@ static bool can_be_sleepable(struct bpf_prog *prog) case BPF_MODIFY_RETURN: case BPF_TRACE_ITER: case BPF_TRACE_FSESSION: + case BPF_TRACE_RAW_TP: + case BPF_TRACE_FENTRY_MULTI: + case BPF_TRACE_FEXIT_MULTI: + case BPF_TRACE_FSESSION_MULTI: return true; default: return false; } } - return prog->type == BPF_PROG_TYPE_LSM || - prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || - prog->type == BPF_PROG_TYPE_STRUCT_OPS; + if (prog->type == BPF_PROG_TYPE_LSM) + return prog->expected_attach_type != BPF_LSM_CGROUP; + + return prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || + prog->type == BPF_PROG_TYPE_STRUCT_OPS || + prog->type == BPF_PROG_TYPE_RAW_TRACEPOINT || + prog->type == BPF_PROG_TYPE_TRACEPOINT; } static int check_attach_btf_id(struct bpf_verifier_env *env) @@ -19544,7 +19279,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } if (prog->sleepable && !can_be_sleepable(prog)) { - verbose(env, "Only fentry/fexit/fsession/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n"); + verbose(env, "Program of this type cannot be sleepable\n"); return -EINVAL; } @@ -19597,6 +19332,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI || prog->expected_attach_type == BPF_MODIFY_RETURN) && 
btf_id_set_contains(&noreturn_deny, btf_id)) { verbose(env, "Attaching fexit/fsession/fmod_ret to __noreturn function '%s' is rejected.\n", @@ -19604,6 +19340,14 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return -EINVAL; } + /* + * We don't get trampoline for tracing_multi programs at this point, + * it's done when tracing_multi link is created. + */ + if (prog->type == BPF_PROG_TYPE_TRACING && + is_tracing_multi(prog->expected_attach_type)) + return 0; + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) @@ -19616,6 +19360,62 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return 0; } +int bpf_check_attach_btf_id_multi(struct btf *btf, struct bpf_prog *prog, u32 btf_id, + struct bpf_attach_target_info *tgt_info) +{ + const struct btf_type *t; + unsigned long addr; + const char *tname; + int err; + + if (!btf_id || !btf) + return -EINVAL; + + /* Check noreturn attachment. */ + if ((prog->expected_attach_type == BPF_TRACE_FEXIT_MULTI || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) && + btf_id_set_contains(&noreturn_deny, btf_id)) + return -EINVAL; + /* Check denied attachment. */ + if (btf_id_set_contains(&btf_id_deny, btf_id)) + return -EINVAL; + + /* Check and get function target data. */ + t = btf_type_by_id(btf, btf_id); + if (!t) + return -EINVAL; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return -EINVAL; + if (!btf_type_is_func(t)) + return -EINVAL; + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + err = btf_distill_func_proto(NULL, btf, t, tname, &tgt_info->fmodel); + if (err < 0) + return err; + if (btf_is_module(btf)) { + /* The bpf program already holds reference to module. 
*/ + if (WARN_ON_ONCE(!prog->aux->mod)) + return -EINVAL; + addr = find_kallsyms_symbol_value(prog->aux->mod, tname); + } else { + addr = kallsyms_lookup_name(tname); + } + if (!addr || !ftrace_location(addr)) + return -ENOENT; + + /* Check sleepable program attachment. */ + if (prog->sleepable) { + err = btf_id_allow_sleepable(btf_id, addr, prog, btf); + if (err) + return err; + } + tgt_info->tgt_addr = addr; + return 0; +} + struct btf *bpf_get_btf_vmlinux(void) { if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { @@ -19834,8 +19634,11 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int struct_meta_reg = BPF_REG_3; int node_offset_reg = BPF_REG_4; - /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */ - if (is_bpf_rbtree_add_kfunc(desc->func_id)) { + /* list_add/rbtree_add have an extra arg (prev/less), + * so args-to-fixup are in diff regs. + */ + if (desc->func_id == special_kfunc_list[KF_bpf_list_add] || + is_bpf_rbtree_add_kfunc(desc->func_id)) { struct_meta_reg = BPF_REG_4; node_offset_reg = BPF_REG_5; } @@ -19853,7 +19656,9 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); *cnt = 1; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_is_return] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + (env->prog->expected_attach_type == BPF_TRACE_FSESSION || + env->prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI)) { + /* * inline the bpf_session_is_return() for fsession: * bool bpf_session_is_return(void *ctx) @@ -19866,7 +19671,8 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_buf[2] = BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1); *cnt = 3; } else if (desc->func_id == special_kfunc_list[KF_bpf_session_cookie] && - env->prog->expected_attach_type == BPF_TRACE_FSESSION) { + (env->prog->expected_attach_type == BPF_TRACE_FSESSION || + env->prog->expected_attach_type == 
BPF_TRACE_FSESSION_MULTI)) { /* * inline bpf_session_cookie() for fsession: * __u64 *bpf_session_cookie(void *ctx) @@ -19897,12 +19703,12 @@ int bpf_fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return 0; } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, + struct bpf_log_attr *attr_log) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; int i, len, ret = -EINVAL, err; - u32 log_true_size; bool is_priv; BTF_TYPE_EMIT(enum bpf_features); @@ -19949,9 +19755,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* user could have requested verbose verifier output * and supplied buffer to store the verification trace */ - ret = bpf_vlog_init(&env->log, attr->log_level, - (char __user *) (unsigned long) attr->log_buf, - attr->log_size); + ret = bpf_vlog_init(&env->log, attr_log->level, attr_log->ubuf, attr_log->size); if (ret) goto err_unlock; @@ -20113,17 +19917,10 @@ skip_full_check: env->prog->aux->verified_insns = env->insn_processed; /* preserve original error even if log finalization is successful */ - err = bpf_vlog_finalize(&env->log, &log_true_size); + err = bpf_log_attr_finalize(attr_log, &env->log); if (err) ret = err; - if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && - copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), - &log_true_size, sizeof(log_true_size))) { - ret = -EFAULT; - goto err_release_maps; - } - if (ret) goto err_release_maps; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 45c0b1ed687a..38f8d9df8fbc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -197,6 +197,14 @@ static u32 cgrp_dfl_implicit_ss_mask; /* some controllers can be threaded on the default hierarchy */ static u32 cgrp_dfl_threaded_ss_mask; +/* + * Set across rebind_subsystems() to the controllers leaving a hierarchy. 
+ * Guarded by cgroup_mutex. Makes find_existing_css_set() resolve them to the + * root css so the affected tasks are migrated there before + * cgroup_apply_control_disable() kills the per-cgroup csses. + */ +static u32 cgroup_rebind_ss_mask; + /* The list of hierarchy roots */ LIST_HEAD(cgroup_roots); static int cgroup_root_count; @@ -264,10 +272,11 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); +static void kill_css_sync(struct cgroup_subsys_state *css); +static void kill_css_finish(struct cgroup_subsys_state *css); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); -static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -374,11 +383,6 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static bool cgroup_has_tasks(struct cgroup *cgrp) -{ - return cgrp->nr_populated_csets; -} - static bool cgroup_is_threaded(struct cgroup *cgrp) { return cgrp->dom_cgrp != cgrp; @@ -407,7 +411,7 @@ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) return false; /* can only have either domain or threaded children */ - if (cgrp->nr_populated_domain_children) + if (READ_ONCE(cgrp->nr_populated_domain_children)) return false; /* and no domain controllers can be enabled */ @@ -759,52 +763,76 @@ static bool css_set_populated(struct css_set *cset) } /** - * cgroup_update_populated - update the populated count of a cgroup - * @cgrp: the target cgroup - * @populated: inc or dec populated count - * - * One of the css_sets associated with @cgrp is either getting its first - * task or losing the last. Update @cgrp->nr_populated_* accordingly. 
The - * count is propagated towards root so that a given cgroup's - * nr_populated_children is zero iff none of its descendants contain any - * tasks. - * - * @cgrp's interface file "cgroup.populated" is zero if both - * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and - * 1 otherwise. When the sum changes from or to zero, userland is notified - * that the content of the interface file has changed. This can be used to - * detect when @cgrp and its descendants become populated or empty. + * css_update_populated - update the populated state of a css and ancestors + * @css: leaf css whose own populated count is changing + * @populated: inc or dec + * + * One of the css_sets pinned by @css is getting its first task or losing the + * last. Propagate the transition up the parent chain so that a css's + * nr_populated_children is zero iff none of its descendants contain any tasks. + * + * For a cgroup->self walk, also runs cgroup-side bookkeeping at each level: + * domain/threaded child split, deferred-destroy trigger, and notification via + * "cgroup.populated" (zero iff cgrp->self has neither populated csets nor + * populated children; userland is notified on transitions). */ -static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +static void css_update_populated(struct cgroup_subsys_state *css, bool populated) { - struct cgroup *child = NULL; + struct cgroup_subsys_state *child = NULL; int adj = populated ? 1 : -1; lockdep_assert_held(&css_set_lock); do { - bool was_populated = cgroup_is_populated(cgrp); + /* non-NULL only on the cgroup->self walk */ + struct cgroup *cgrp = css_is_self(css) ? 
css->cgroup : NULL; + bool was_populated = css_is_populated(css); if (!child) { - cgrp->nr_populated_csets += adj; + WRITE_ONCE(css->nr_populated_csets, + css->nr_populated_csets + adj); } else { - if (cgroup_is_threaded(child)) - cgrp->nr_populated_threaded_children += adj; - else - cgrp->nr_populated_domain_children += adj; + WRITE_ONCE(css->nr_populated_children, + css->nr_populated_children + adj); + if (cgrp) { + if (cgroup_is_threaded(child->cgroup)) + WRITE_ONCE(cgrp->nr_populated_threaded_children, + cgrp->nr_populated_threaded_children + adj); + else + WRITE_ONCE(cgrp->nr_populated_domain_children, + cgrp->nr_populated_domain_children + adj); + } } - if (was_populated == cgroup_is_populated(cgrp)) + if (was_populated == css_is_populated(css)) break; - cgroup1_check_for_release(cgrp); - TRACE_CGROUP_PATH(notify_populated, cgrp, - cgroup_is_populated(cgrp)); - cgroup_file_notify(&cgrp->events_file); + /* + * Pair with smp_mb() in kill_css_sync(). Either we observe + * CSS_DYING and queue, or the caller observes our decrement + * and fires synchronously. + */ + smp_mb(); - child = cgrp; - cgrp = cgroup_parent(cgrp); - } while (cgrp); + /* + * Subtree just emptied below a dying css. Fire deferred kill. + * The transition is one-shot for a dying css. + */ + if (was_populated && css_is_dying(css)) { + css_get(css); + WARN_ON_ONCE(!queue_work(cgroup_offline_wq, &css->kill_finish_work)); + } + + if (cgrp) { + cgroup1_check_for_release(cgrp); + TRACE_CGROUP_PATH(notify_populated, cgrp, + cgroup_is_populated(cgrp)); + cgroup_file_notify(&cgrp->events_file); + } + + child = css; + css = css->parent; + } while (css); } /** @@ -812,17 +840,27 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) * @cset: target css_set * @populated: whether @cset is populated or depopulated * - * @cset is either getting the first task or losing the last. Update the - * populated counters of all associated cgroups accordingly. 
+ * @cset is either getting the first task or losing the last. Update the + * populated counters along each linked cgroup's self chain and each + * subsystem css that @cset pins. */ static void css_set_update_populated(struct css_set *cset, bool populated) { struct cgrp_cset_link *link; + struct cgroup_subsys *ss; + int ssid; lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) - cgroup_update_populated(link->cgrp, populated); + css_update_populated(&link->cgrp->self, populated); + + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cset->subsys[ssid]; + + if (css) + css_update_populated(css, populated); + } } /* @@ -1053,7 +1091,15 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->subsys_mask & (1UL << i)) { + if (unlikely(cgroup_rebind_ss_mask & (1UL << i))) { + /* + * @ss is leaving this hierarchy and its per-cgroup + * csses are about to be killed. Resolve to the + * surviving root css so the tasks are migrated there. + */ + template[i] = cgroup_css(&root->cgrp, ss); + WARN_ON_ONCE(!template[i]); + } else if (root->subsys_mask & (1UL << i)) { /* * @ss is in this hierarchy, so we want the * effective css from @cgrp. @@ -1823,11 +1869,17 @@ int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask) struct cgroup *scgrp = &cgrp_dfl_root.cgrp; /* - * Controllers from default hierarchy that need to be rebound - * are all disabled together in one go. + * Controllers leaving the default hierarchy are disabled + * together. cgroup_rebind_ss_mask makes cgroup_apply_control() + * migrate their tasks to the root css, so the per-cgroup csses + * are unpopulated when cgroup_finalize_control() kills them. + * Clear it before cgroup_finalize_control(), which does no + * css_set lookup. 
*/ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + cgroup_rebind_ss_mask = dfl_disable_ss_mask; WARN_ON(cgroup_apply_control(scgrp)); + cgroup_rebind_ss_mask = 0; cgroup_finalize_control(scgrp, 0); } @@ -1841,9 +1893,14 @@ int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask) WARN_ON(!css || cgroup_css(dcgrp, ss)); if (src_root != &cgrp_dfl_root) { - /* disable from the source */ + /* + * Disable from the source, migrating its tasks to the + * root css first (see cgroup_rebind_ss_mask). + */ src_root->subsys_mask &= ~(1 << ssid); + cgroup_rebind_ss_mask = 1 << ssid; WARN_ON(cgroup_apply_control(scgrp)); + cgroup_rebind_ss_mask = 0; cgroup_finalize_control(scgrp, 0); } @@ -2065,7 +2122,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) #endif init_waitqueue_head(&cgrp->offline_waitq); - init_waitqueue_head(&cgrp->dying_populated_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } @@ -2170,7 +2226,7 @@ int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask) hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) - cgroup_update_populated(root_cgrp, true); + css_update_populated(&root_cgrp->self, true); } spin_unlock_irq(&css_set_lock); @@ -3208,7 +3264,7 @@ restart: struct cgroup_subsys_state *css = cgroup_css(dsct, ss); DEFINE_WAIT(wait); - if (!css || !percpu_ref_is_dying(&css->refcnt)) + if (!css || !css_is_dying(css)) continue; cgroup_get_live(dsct); @@ -3375,7 +3431,9 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { - kill_css(css); + kill_css_sync(css); + if (!css_is_populated(css)) + kill_css_finish(css); } else if (!css_visible(css)) { css_clear_dir(css); if (ss->css_reset) @@ -3703,7 +3761,7 @@ static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of, if (!cgrp) return -ENOENT; - cgrp->max_descendants = descendants; + WRITE_ONCE(cgrp->max_descendants, 
descendants); cgroup_kn_unlock(of->kn); @@ -3746,7 +3804,7 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, if (!cgrp) return -ENOENT; - cgrp->max_depth = depth; + WRITE_ONCE(cgrp->max_depth, depth); cgroup_kn_unlock(of->kn); @@ -5067,10 +5125,12 @@ repeat: task = list_entry(it->task_pos, struct task_struct, cg_list); /* - * Hide tasks that are exiting but not yet removed. Keep zombie - * leaders with live threads visible. + * Hide tasks that are exiting but not yet removed by default. Keep + * zombie leaders with live threads visible. Usages that need to walk + * every existing task can opt out via CSS_TASK_ITER_WITH_DEAD. */ - if ((task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) + if (!(it->flags & CSS_TASK_ITER_WITH_DEAD) && + (task->flags & PF_EXITING) && !atomic_read(&task->signal->live)) goto repeat; if (it->flags & CSS_TASK_ITER_PROCS) { @@ -5514,7 +5574,7 @@ static struct cftype cgroup_psi_files[] = { * css destruction is four-stage process. * * 1. Destruction starts. Killing of the percpu_ref is initiated. - * Implemented in kill_css(). + * Implemented in kill_css_finish(). * * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs * and thus css_tryget_online() is guaranteed to fail, the css can be @@ -5659,6 +5719,22 @@ static void css_release(struct percpu_ref *ref) queue_work(cgroup_release_wq, &css->destroy_work); } +/* + * Deferred kill_css_finish() fired from css_update_populated() once a dying + * css's hierarchical populated state drops to zero. Pinned by css_get() at the + * queue site; matched by css_put() here. 
+ */ +static void kill_css_finish_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, kill_finish_work); + + cgroup_lock(); + kill_css_finish(css); + cgroup_unlock(); + css_put(css); +} + static void init_and_link_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { @@ -5672,6 +5748,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + INIT_WORK(&css->kill_finish_work, kill_css_finish_work_fn); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -5993,7 +6070,7 @@ out_unlock: /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget_online() is now guaranteed to fail. Tell the subsystem to - * initiate destruction and put the css ref from kill_css(). + * initiate destruction and put the css ref from kill_css_finish(). */ static void css_killed_work_fn(struct work_struct *work) { @@ -6026,15 +6103,12 @@ static void css_killed_ref_fn(struct percpu_ref *ref) } /** - * kill_css - destroy a css - * @css: css to destroy + * kill_css_sync - synchronous half of css teardown + * @css: css being killed * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget_online() is guaranteed to fail and when - * the reference count reaches zero, @css will be released. + * See cgroup_destroy_locked(). */ -static void kill_css(struct cgroup_subsys_state *css) +static void kill_css_sync(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; @@ -6052,28 +6126,17 @@ static void kill_css(struct cgroup_subsys_state *css) css->flags |= CSS_DYING; /* - * This must happen before css is disassociated with its cgroup. - * See seq_css() for details. + * Pair with smp_mb() in css_update_populated(). 
Either our + * caller observes the walker's decrement and fires + * synchronously, or the walker observes CSS_DYING and queues. */ - css_clear_dir(css); + smp_mb(); /* - * Killing would put the base ref, but we need to keep it alive - * until after ->css_offline(). - */ - css_get(css); - - /* - * cgroup core guarantees that, by the time ->css_offline() is - * invoked, no new css reference will be given out via - * css_tryget_online(). We can't simply call percpu_ref_kill() and - * proceed to offlining css's because percpu_ref_kill() doesn't - * guarantee that the ref is seen as killed on all CPUs on return. - * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. + * This must happen before css is disassociated with its cgroup. + * See seq_css() for details. */ - percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); + css_clear_dir(css); css->cgroup->nr_dying_subsys[ss->id]++; /* @@ -6087,44 +6150,88 @@ static void kill_css(struct cgroup_subsys_state *css) } /** - * cgroup_destroy_locked - the first stage of cgroup destruction + * kill_css_finish - deferred half of css teardown + * @css: css being killed + * + * See cgroup_destroy_locked(). + */ +static void kill_css_finish(struct cgroup_subsys_state *css) +{ + lockdep_assert_held(&cgroup_mutex); + + /* + * Skip on re-entry: cgroup_apply_control_disable() may have killed @css + * earlier. cgroup_destroy_locked() can still walk it because + * offline_css() (which NULLs cgrp->subsys[ssid]) runs async. + */ + if (percpu_ref_is_dying(&css->refcnt)) + return; + + /* + * Killing would put the base ref, but we need to keep it alive until + * after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is invoked, + * no new css reference will be given out via css_tryget_online(). 
We + * can't simply call percpu_ref_kill() and proceed to offlining css's + * because percpu_ref_kill() doesn't guarantee that the ref is seen as + * killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each css is + * confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); +} + +/** + * cgroup_destroy_locked - destroy @cgrp (called on rmdir) * @cgrp: cgroup to be destroyed * - * css's make use of percpu refcnts whose killing latency shouldn't be - * exposed to userland and are RCU protected. Also, cgroup core needs to - * guarantee that css_tryget_online() won't succeed by the time - * ->css_offline() is invoked. To satisfy all the requirements, - * destruction is implemented in the following two steps. - * - * s1. Verify @cgrp can be destroyed and mark it dying. Remove all - * userland visible parts and start killing the percpu refcnts of - * css's. Set up so that the next stage will be kicked off once all - * the percpu refcnts are confirmed to be killed. - * - * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the - * rest of destruction. Once all cgroup references are gone, the - * cgroup is RCU-freed. - * - * This function implements s1. After this step, @cgrp is gone as far as - * the userland is concerned and a new cgroup with the same name may be - * created. As cgroup doesn't care about the names internally, this - * doesn't cause any problem. + * Tear down @cgrp on behalf of rmdir. Constraints: + * + * - Userspace: rmdir must succeed when cgroup.procs and friends are empty. + * + * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's + * subtree is still doing kernel work. A task hidden from cgroup.procs (past + * exit_signals() with signal->live cleared) can still schedule, allocate, and + * consume resources until its final context switch. Dying descendants in the + * subtree can host such tasks too. 
+ * + * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs. + * + * The destruction runs in three parts: + * + * - This function: synchronous user-visible state teardown plus kill_css_sync() + * on each subsystem css. + * + * - For each subsys css: fire kill_css_finish() synchronously if the subtree is + * already drained, otherwise rely on css_update_populated() to queue + * kill_finish_work when the last populated cset under the css empties. + * + * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn -> + * ->css_offline() -> release/free. + * + * Return 0 on success, -EBUSY if a userspace-visible task or an online child + * remains. */ static int cgroup_destroy_locked(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup *tcgrp, *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *css; struct cgrp_cset_link *link; + struct css_task_iter it; + struct task_struct *task; int ssid, ret; lockdep_assert_held(&cgroup_mutex); - /* - * Only migration can raise populated from zero and we're already - * holding cgroup_mutex. 
- */ - if (cgroup_is_populated(cgrp)) + css_task_iter_start(&cgrp->self, 0, &it); + task = css_task_iter_next(&it); + css_task_iter_end(&it); + if (task) return -EBUSY; /* @@ -6148,9 +6255,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) link->cset->dead = true; spin_unlock_irq(&css_set_lock); - /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) - kill_css(css); + kill_css_sync(css); /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ css_clear_dir(&cgrp->self); @@ -6181,81 +6287,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); + for_each_css(css, ssid, cgrp) { + if (!css_is_populated(css)) + kill_css_finish(css); + } + return 0; }; -/** - * cgroup_drain_dying - wait for dying tasks to leave before rmdir - * @cgrp: the cgroup being removed - * - * cgroup.procs and cgroup.threads use css_task_iter which filters out - * PF_EXITING tasks so that userspace doesn't see tasks that have already been - * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the - * cgroup has non-empty css_sets - is only updated when dying tasks pass through - * cgroup_task_dead() in finish_task_switch(). This creates a window where - * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir - * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no - * tasks. - * - * This function aligns cgroup_has_tasks() with what userspace can observe. If - * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are - * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the - * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. - * - * This function only concerns itself with this cgroup's own dying tasks. - * Whether the cgroup has children is cgroup_destroy_locked()'s problem. 
- * - * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we - * retry the full check from scratch. - * - * Must be called with cgroup_mutex held. - */ -static int cgroup_drain_dying(struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) -{ - struct css_task_iter it; - struct task_struct *task; - DEFINE_WAIT(wait); - - lockdep_assert_held(&cgroup_mutex); -retry: - if (!cgroup_has_tasks(cgrp)) - return 0; - - /* Same iterator as cgroup.threads - if any task is visible, it's busy */ - css_task_iter_start(&cgrp->self, 0, &it); - task = css_task_iter_next(&it); - css_task_iter_end(&it); - - if (task) - return -EBUSY; - - /* - * All remaining tasks are PF_EXITING and will pass through - * cgroup_task_dead() shortly. Wait for a kick and retry. - * - * cgroup_has_tasks() can't transition from false to true while we're - * holding cgroup_mutex, but the true to false transition happens - * under css_set_lock (via cgroup_task_dead()). We must retest and - * prepare_to_wait() under css_set_lock. Otherwise, the transition - * can happen between our first test and prepare_to_wait(), and we - * sleep with no one to wake us. 
- */ - spin_lock_irq(&css_set_lock); - if (!cgroup_has_tasks(cgrp)) { - spin_unlock_irq(&css_set_lock); - return 0; - } - prepare_to_wait(&cgrp->dying_populated_waitq, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&css_set_lock); - mutex_unlock(&cgroup_mutex); - schedule(); - finish_wait(&cgrp->dying_populated_waitq, &wait); - mutex_lock(&cgroup_mutex); - goto retry; -} - int cgroup_rmdir(struct kernfs_node *kn) { struct cgroup *cgrp; @@ -6265,12 +6304,9 @@ int cgroup_rmdir(struct kernfs_node *kn) if (!cgrp) return 0; - ret = cgroup_drain_dying(cgrp); - if (!ret) { - ret = cgroup_destroy_locked(cgrp); - if (!ret) - TRACE_CGROUP_PATH(rmdir, cgrp); - } + ret = cgroup_destroy_locked(cgrp); + if (!ret) + TRACE_CGROUP_PATH(rmdir, cgrp); cgroup_kn_unlock(kn); return ret; @@ -7030,7 +7066,6 @@ void cgroup_task_exit(struct task_struct *tsk) static void do_cgroup_task_dead(struct task_struct *tsk) { - struct cgrp_cset_link *link; struct css_set *cset; unsigned long flags; @@ -7044,11 +7079,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk) if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) list_add_tail(&tsk->cg_list, &cset->dying_tasks); - /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) - if (waitqueue_active(&link->cgrp->dying_populated_waitq)) - wake_up(&link->cgrp->dying_populated_waitq); - if (dl_task(tsk)) dec_dl_tasks_cs(tsk); diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index bb4e692bea30..f7aaf01f7cd5 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -167,6 +167,7 @@ struct cpuset { */ int nr_deadline_tasks; int nr_migrate_dl_tasks; + /* DL bandwidth that needs destination reservation for this attach. 
*/ u64 sum_migrate_dl_bw; /* * CPU used for temporary DL bandwidth allocation during attach; diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 7308e9b02495..3e9968dd91e9 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -312,7 +312,7 @@ void cpuset1_hotplug_update_tasks(struct cpuset *cs, * This is full cgroup operation which will also call back into * cpuset. Execute it asynchronously using workqueue. */ - if (is_empty && cs->css.cgroup->nr_populated_csets && + if (is_empty && cgroup_has_tasks(cs->css.cgroup) && css_tryget_online(&cs->css)) { struct cpuset_remove_tasks_struct *s; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index e3a081a07c6d..591e3aa487fc 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -432,7 +432,7 @@ static inline bool partition_is_populated(struct cpuset *cs, * nr_populated_domain_children may include populated * csets from descendants that are partitions. */ - if (cs->css.cgroup->nr_populated_csets || + if (cgroup_has_tasks(cs->css.cgroup) || cs->attach_in_progress) return true; @@ -1004,8 +1004,11 @@ void rebuild_sched_domains_locked(void) * prevent the panic. 
*/ for (i = 0; doms && i < ndoms; i++) { - if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) + if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask))) { + free_sched_domains(doms, ndoms); + kfree(attr); return; + } } /* Have scheduler rebuild the domains */ @@ -1718,7 +1721,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, */ if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, - xcpus, parent->effective_xcpus); + cs->effective_xcpus, + parent->effective_xcpus); if (old_prs > 0) new_prs = -old_prs; @@ -1810,9 +1814,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, * Compute add/delete mask to/from effective_cpus * * For valid partition: - * addmask = exclusive_cpus & ~newmask + * addmask = effective_xcpus & ~newmask * & parent->effective_xcpus - * delmask = newmask & ~exclusive_cpus + * delmask = newmask & ~effective_xcpus * & parent->effective_xcpus * * For invalid partition: @@ -1824,11 +1828,11 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, deleting = cpumask_and(tmp->delmask, newmask, parent->effective_xcpus); } else { - cpumask_andnot(tmp->addmask, xcpus, newmask); + cpumask_andnot(tmp->addmask, cs->effective_xcpus, newmask); adding = cpumask_and(tmp->addmask, tmp->addmask, parent->effective_xcpus); - cpumask_andnot(tmp->delmask, newmask, xcpus); + cpumask_andnot(tmp->delmask, newmask, cs->effective_xcpus); deleting = cpumask_and(tmp->delmask, tmp->delmask, parent->effective_xcpus); } @@ -1867,7 +1871,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, part_error = PERR_NOCPUS; deleting = false; adding = cpumask_and(tmp->addmask, - xcpus, parent->effective_xcpus); + cs->effective_xcpus, parent->effective_xcpus); } } else { /* @@ -1889,7 +1893,8 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, part_error = PERR_NOCPUS; if (is_partition_valid(cs)) adding = cpumask_and(tmp->addmask, - xcpus, 
parent->effective_xcpus); + cs->effective_xcpus, + parent->effective_xcpus); } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && cpumask_subset(xcpus, parent->effective_xcpus)) { struct cgroup_subsys_state *css; @@ -2993,7 +2998,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) struct cpuset *cs, *oldcs; struct task_struct *task; bool setsched_check; - int ret; + int cpu, ret; /* used later by cpuset_attach() */ cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); @@ -3038,31 +3043,31 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) } if (dl_task(task)) { + /* + * Count all migrating DL tasks for cpuset task accounting. + * Only tasks that need a root-domain bandwidth move + * contribute to sum_migrate_dl_bw. + */ cs->nr_migrate_dl_tasks++; - cs->sum_migrate_dl_bw += task->dl.dl_bw; + if (dl_task_needs_bw_move(task, cs->effective_cpus)) + cs->sum_migrate_dl_bw += task->dl.dl_bw; } } - if (!cs->nr_migrate_dl_tasks) + if (!cs->sum_migrate_dl_bw) goto out_success; - if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) { - int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); - - if (unlikely(cpu >= nr_cpu_ids)) { - reset_migrate_dl_data(cs); - ret = -EINVAL; - goto out_unlock; - } + cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); + if (unlikely(cpu >= nr_cpu_ids)) { + ret = -EINVAL; + goto out_unlock; + } - ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); - if (ret) { - reset_migrate_dl_data(cs); - goto out_unlock; - } + ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw); + if (ret) + goto out_unlock; - cs->dl_bw_cpu = cpu; - } + cs->dl_bw_cpu = cpu; out_success: /* @@ -3070,7 +3075,10 @@ out_success: * changes which zero cpus/mems_allowed. */ cs->attach_in_progress++; + out_unlock: + if (ret) + reset_migrate_dl_data(cs); mutex_unlock(&cpuset_mutex); return ret; } @@ -4176,11 +4184,11 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * current's mems_allowed, yes. 
If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, * yes. If current has access to memory reserves as an oom victim, yes. - * Otherwise, no. + * If the current task is PF_EXITING, yes. Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed. + * unless the task has been OOM killed or is exiting. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * @@ -4194,7 +4202,9 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, * so no allocation on a node outside the cpuset is allowed (unless - * in interrupt, of course). + * in interrupt, of course). The PF_EXITING check must therefore + * come before the __GFP_HARDWALL check, otherwise a dying task + * would be blocked on the fast path. * * The second pass through get_page_from_freelist() doesn't even call * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() @@ -4204,6 +4214,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * tsk_is_oom_victim - any node ok + * PF_EXITING - any node ok (let dying task exit quickly) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. 
*/ @@ -4223,10 +4234,12 @@ bool cpuset_current_node_allowed(int node, gfp_t gfp_mask) */ if (unlikely(tsk_is_oom_victim(current))) return true; + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return false; - if (current->flags & PF_EXITING) /* Let dying task have memory */ + if (cpuset_v2()) return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 1ab1fb47f271..4753a67d0f0f 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -602,6 +602,7 @@ get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) pool = NULL; continue; } + pool = ERR_PTR(-ENOMEM); } } diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 4fdab4cf49e0..5e82a03b3270 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -9,6 +9,7 @@ */ #include <linux/bitops.h> +#include <linux/limits.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/cgroup.h> @@ -17,6 +18,22 @@ #define RDMACG_MAX_STR "max" +enum rdmacg_limit_tokens { + RDMACG_HCA_HANDLE_VAL, + RDMACG_HCA_HANDLE_MAX, + RDMACG_HCA_OBJECT_VAL, + RDMACG_HCA_OBJECT_MAX, + NR_RDMACG_LIMIT_TOKENS, +}; + +static const match_table_t rdmacg_limit_tokens = { + { RDMACG_HCA_HANDLE_VAL, "hca_handle=%d" }, + { RDMACG_HCA_HANDLE_MAX, "hca_handle=max" }, + { RDMACG_HCA_OBJECT_VAL, "hca_object=%d" }, + { RDMACG_HCA_OBJECT_MAX, "hca_object=max" }, + { NR_RDMACG_LIMIT_TOKENS, NULL }, +}; + /* * Protects list of resource pools maintained on per cgroup basis * and rdma device list. 
@@ -27,6 +44,7 @@ static LIST_HEAD(rdmacg_devices); enum rdmacg_file_type { RDMACG_RESOURCE_TYPE_MAX, RDMACG_RESOURCE_TYPE_STAT, + RDMACG_RESOURCE_TYPE_PEAK, }; /* @@ -43,6 +61,7 @@ static char const *rdmacg_resource_names[] = { struct rdmacg_resource { int max; int usage; + int peak; }; /* @@ -62,6 +81,12 @@ struct rdmacg_resource_pool { u64 usage_sum; /* total number counts which are set to max */ int num_max_cnt; + + /* per-resource event counters */ + u64 events_max[RDMACG_RESOURCE_MAX]; + u64 events_alloc_fail[RDMACG_RESOURCE_MAX]; + u64 events_local_max[RDMACG_RESOURCE_MAX]; + u64 events_local_alloc_fail[RDMACG_RESOURCE_MAX]; }; static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) @@ -109,6 +134,26 @@ static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) kfree(rpool); } +static bool rpool_has_persistent_state(struct rdmacg_resource_pool *rpool) +{ + int i; + + /* + * Keep the rpool alive if any peak value is non-zero, + * so that rdma.peak persists as a historical high- + * watermark even after all resources are freed. + */ + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + if (rpool->resources[i].peak || + rpool->events_max[i] || + rpool->events_local_max[i] || + rpool->events_alloc_fail[i] || + rpool->events_local_alloc_fail[i]) + return true; + } + return false; +} + static struct rdmacg_resource_pool * find_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) @@ -187,11 +232,67 @@ uncharge_cg_locked(struct rdma_cgroup *cg, rpool->usage_sum--; if (rpool->usage_sum == 0 && rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { - /* - * No user of the rpool and all entries are set to max, so - * safe to delete this rpool. - */ - free_cg_rpool_locked(rpool); + if (!rpool_has_persistent_state(rpool)) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. 
+ */ + free_cg_rpool_locked(rpool); + } + } +} + +/** + * rdmacg_event_locked - fire event when resource allocation exceeds limit + * @cg: requesting cgroup + * @over_cg: cgroup whose limit was exceeded + * @device: rdma device + * @index: resource type index + * + * Must be called under rdmacg_mutex. Updates event counters in the + * resource pools of @cg and @over_cg, propagates hierarchical max + * events from @over_cg (including itself) upward, and notifies + * userspace via cgroup_file_notify(). + */ +static void rdmacg_event_locked(struct rdma_cgroup *cg, + struct rdma_cgroup *over_cg, + struct rdmacg_device *device, + enum rdmacg_resource_type index) +{ + struct rdmacg_resource_pool *rpool; + struct rdma_cgroup *p; + + lockdep_assert_held(&rdmacg_mutex); + + /* Increment local alloc_fail in requesting cgroup */ + rpool = find_cg_rpool_locked(cg, device); + if (rpool) { + rpool->events_local_alloc_fail[index]++; + cgroup_file_notify(&cg->events_local_file); + } + + /* Increment local max in the over-limit cgroup */ + rpool = find_cg_rpool_locked(over_cg, device); + if (rpool) { + rpool->events_local_max[index]++; + cgroup_file_notify(&over_cg->events_local_file); + } + + /* Propagate hierarchical max events upward */ + for (p = over_cg; parent_rdmacg(p); p = parent_rdmacg(p)) { + rpool = get_cg_rpool_locked(p, device); + if (!IS_ERR(rpool)) { + rpool->events_max[index]++; + cgroup_file_notify(&p->events_file); + } + } + /* Propagate hierarchical alloc_fail from requesting cgroup upward */ + for (p = cg; parent_rdmacg(p); p = parent_rdmacg(p)) { + rpool = get_cg_rpool_locked(p, device); + if (!IS_ERR(rpool)) { + rpool->events_alloc_fail[index]++; + cgroup_file_notify(&p->events_file); + } } } @@ -293,12 +394,20 @@ int rdmacg_try_charge(struct rdma_cgroup **rdmacg, } } } + /* Update peak only after all charges succeed */ + for (p = cg; p; p = parent_rdmacg(p)) { + rpool = find_cg_rpool_locked(p, device); + if (rpool && rpool->resources[index].usage > 
rpool->resources[index].peak) + rpool->resources[index].peak = rpool->resources[index].usage; + } mutex_unlock(&rdmacg_mutex); *rdmacg = cg; return 0; err: + if (ret == -EAGAIN) + rdmacg_event_locked(cg, p, device, index); mutex_unlock(&rdmacg_mutex); rdmacg_uncharge_hierarchy(cg, device, p, index); return ret; @@ -355,62 +464,6 @@ void rdmacg_unregister_device(struct rdmacg_device *device) } EXPORT_SYMBOL(rdmacg_unregister_device); -static int parse_resource(char *c, int *intval) -{ - substring_t argstr; - char *name, *value = c; - size_t len; - int ret, i; - - name = strsep(&value, "="); - if (!name || !value) - return -EINVAL; - - i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); - if (i < 0) - return i; - - len = strlen(value); - - argstr.from = value; - argstr.to = value + len; - - ret = match_int(&argstr, intval); - if (ret >= 0) { - if (*intval < 0) - return -EINVAL; - return i; - } - if (strncmp(value, RDMACG_MAX_STR, len) == 0) { - *intval = S32_MAX; - return i; - } - return -EINVAL; -} - -static int rdmacg_parse_limits(char *options, - int *new_limits, unsigned long *enables) -{ - char *c; - int err = -EINVAL; - - /* parse resource options */ - while ((c = strsep(&options, " ")) != NULL) { - int index, intval; - - index = parse_resource(c, &intval); - if (index < 0) - goto err; - - new_limits[index] = intval; - *enables |= BIT(index); - } - return 0; - -err: - return err; -} - static struct rdmacg_device *rdmacg_get_device_locked(const char *name) { struct rdmacg_device *device; @@ -432,6 +485,7 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, struct rdmacg_resource_pool *rpool; struct rdmacg_device *device; char *options = strstrip(buf); + char *p; int *new_limits; unsigned long enables = 0; int i = 0, ret = 0; @@ -449,9 +503,45 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, goto err; } - ret = rdmacg_parse_limits(options, new_limits, &enables); - if (ret) - goto parse_err; + /* parse 
resource limit tokens */ + while ((p = strsep(&options, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + int tok, intval; + + if (!*p) + continue; + + tok = match_token(p, rdmacg_limit_tokens, args); + switch (tok) { + case RDMACG_HCA_HANDLE_VAL: + if (match_int(&args[0], &intval) || intval < 0) { + ret = -EINVAL; + goto parse_err; + } + new_limits[RDMACG_RESOURCE_HCA_HANDLE] = intval; + enables |= BIT(RDMACG_RESOURCE_HCA_HANDLE); + break; + case RDMACG_HCA_HANDLE_MAX: + new_limits[RDMACG_RESOURCE_HCA_HANDLE] = S32_MAX; + enables |= BIT(RDMACG_RESOURCE_HCA_HANDLE); + break; + case RDMACG_HCA_OBJECT_VAL: + if (match_int(&args[0], &intval) || intval < 0) { + ret = -EINVAL; + goto parse_err; + } + new_limits[RDMACG_RESOURCE_HCA_OBJECT] = intval; + enables |= BIT(RDMACG_RESOURCE_HCA_OBJECT); + break; + case RDMACG_HCA_OBJECT_MAX: + new_limits[RDMACG_RESOURCE_HCA_OBJECT] = S32_MAX; + enables |= BIT(RDMACG_RESOURCE_HCA_OBJECT); + break; + default: + ret = -EINVAL; + goto parse_err; + } + } /* acquire lock to synchronize with hot plug devices */ mutex_lock(&rdmacg_mutex); @@ -474,11 +564,13 @@ static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, if (rpool->usage_sum == 0 && rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { - /* - * No user of the rpool and all entries are set to max, so - * safe to delete this rpool. - */ - free_cg_rpool_locked(rpool); + if (!rpool_has_persistent_state(rpool)) { + /* + * No user of the rpool and all entries are set to max, so + * safe to delete this rpool. + */ + free_cg_rpool_locked(rpool); + } } dev_err: @@ -508,6 +600,8 @@ static void print_rpool_values(struct seq_file *sf, value = rpool->resources[i].max; else value = S32_MAX; + } else if (sf_type == RDMACG_RESOURCE_TYPE_PEAK) { + value = rpool ? 
rpool->resources[i].peak : 0; } else { if (rpool) value = rpool->resources[i].usage; @@ -544,6 +638,64 @@ static int rdmacg_resource_read(struct seq_file *sf, void *v) return 0; } +static int rdmacg_events_show(struct seq_file *sf, void *v) +{ + struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); + struct rdmacg_resource_pool *rpool; + struct rdmacg_device *device; + int i; + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) { + rpool = find_cg_rpool_locked(cg, device); + + seq_printf(sf, "%s ", device->name); + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + seq_printf(sf, "%s.max=%llu %s.alloc_fail=%llu", + rdmacg_resource_names[i], + rpool ? rpool->events_max[i] : 0ULL, + rdmacg_resource_names[i], + rpool ? rpool->events_alloc_fail[i] : 0ULL); + if (i < RDMACG_RESOURCE_MAX - 1) + seq_putc(sf, ' '); + } + seq_putc(sf, '\n'); + } + + mutex_unlock(&rdmacg_mutex); + return 0; +} + +static int rdmacg_events_local_show(struct seq_file *sf, void *v) +{ + struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); + struct rdmacg_resource_pool *rpool; + struct rdmacg_device *device; + int i; + + mutex_lock(&rdmacg_mutex); + + list_for_each_entry(device, &rdmacg_devices, dev_node) { + rpool = find_cg_rpool_locked(cg, device); + + seq_printf(sf, "%s ", device->name); + for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { + seq_printf(sf, "%s.max=%llu %s.alloc_fail=%llu", + rdmacg_resource_names[i], + rpool ? rpool->events_local_max[i] : 0ULL, + rdmacg_resource_names[i], + rpool ? 
rpool->events_local_alloc_fail[i] : 0ULL); + if (i < RDMACG_RESOURCE_MAX - 1) + seq_putc(sf, ' '); + } + seq_putc(sf, '\n'); + } + + mutex_unlock(&rdmacg_mutex); + return 0; +} + static struct cftype rdmacg_files[] = { { .name = "max", @@ -558,6 +710,24 @@ static struct cftype rdmacg_files[] = { .private = RDMACG_RESOURCE_TYPE_STAT, .flags = CFTYPE_NOT_ON_ROOT, }, + { + .name = "peak", + .seq_show = rdmacg_resource_read, + .private = RDMACG_RESOURCE_TYPE_PEAK, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "events", + .seq_show = rdmacg_events_show, + .file_offset = offsetof(struct rdma_cgroup, events_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "events.local", + .seq_show = rdmacg_events_local_show, + .file_offset = offsetof(struct rdma_cgroup, events_local_file), + .flags = CFTYPE_NOT_ON_ROOT, + }, { } /* terminate */ }; @@ -577,6 +747,13 @@ rdmacg_css_alloc(struct cgroup_subsys_state *parent) static void rdmacg_css_free(struct cgroup_subsys_state *css) { struct rdma_cgroup *cg = css_rdmacg(css); + struct rdmacg_resource_pool *rpool, *tmp; + + /* Clean up rpools kept alive by non-zero peak values */ + mutex_lock(&rdmacg_mutex); + list_for_each_entry_safe(rpool, tmp, &cg->rpools, cg_node) + free_cg_rpool_locked(rpool); + mutex_unlock(&rdmacg_mutex); kfree(cg); } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 150e5871e66f..de816a43db9f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" +#include <linux/cpumask.h> #include <linux/sched/cputime.h> #include <linux/bpf.h> @@ -53,7 +54,7 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) } /** - * css_rstat_updated - keep track of updated rstat_cpu + * __css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * @@ -63,31 +64,27 @@ static inline struct llist_head *ss_lhead_cpu(struct 
cgroup_subsys *ss, int cpu) * * NOTE: if the user needs the guarantee that the updater either add itself in * the lockless list or the concurrent flusher flushes its updated stats, a - * memory barrier is needed before the call to css_rstat_updated() i.e. a + * memory barrier is needed before the call to __css_rstat_updated() i.e. a * barrier after updating the per-cpu stats and before calling - * css_rstat_updated(). + * __css_rstat_updated(). */ -__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) +void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct llist_head *lhead; struct css_rstat_cpu *rstatc; struct llist_node *self; - /* - * Since bpf programs can call this function, prevent access to - * uninitialized rstat pointers. - */ + /* Prevent access to uninitialized rstat pointers. */ if (!css_uses_rstat(css)) return; lockdep_assert_preemption_disabled(); /* - * For archs withnot nmi safe cmpxchg or percpu ops support, ignore - * the requests from nmi context. + * The lockless insertion below relies on NMI-safe cmpxchg; + * bail out in NMI on archs that don't provide it. */ - if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || - !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi()) + if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && in_nmi()) return; rstatc = css_rstat_cpu(css, cpu); @@ -125,6 +122,18 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) llist_add(&rstatc->lnode, lhead); } +/* + * BPF-facing wrapper for __css_rstat_updated(). Validate the caller-provided + * CPU before passing it to the internal rstat updater. 
+ */ +__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) +{ + if (unlikely(cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu))) + return; + + __css_rstat_updated(css, cpu); +} + static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) { /* put @css and all ancestors on the corresponding updated lists */ @@ -170,7 +179,7 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) * flusher flush the stats updated by the updater who have * observed that they are already on the list. The * corresponding barrier pair for this one should be before - * css_rstat_updated() by the user. + * __css_rstat_updated() by the user. * * For now, there aren't any such user, so not adding the * barrier here but if such a use-case arise, please add @@ -614,7 +623,7 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); - css_rstat_updated(&cgrp->self, smp_processor_id()); + __css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 7c3924614e01..26831a2a5739 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -22,7 +22,7 @@ CONFIG_SLAB_FREELIST_RANDOM=y CONFIG_SLAB_FREELIST_HARDENED=y CONFIG_SLAB_BUCKETS=y CONFIG_SHUFFLE_PAGE_ALLOCATOR=y -CONFIG_RANDOM_KMALLOC_CACHES=y +CONFIG_KMALLOC_PARTITION_CACHES=y # Sanity check userspace page table mappings. 
CONFIG_PAGE_TABLE_CHECK=y diff --git a/kernel/cpu.c b/kernel/cpu.c index bc4f7a9ba64e..f975bb34915b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2639,7 +2639,7 @@ static void cpuhp_offline_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); - dev->offline = true; + dev_set_offline(dev); /* Tell user space about the state change */ kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } @@ -2648,7 +2648,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); - dev->offline = false; + dev_clear_offline(dev); /* Tell user space about the state change */ kobject_uevent(&dev->kobj, KOBJ_ONLINE); } diff --git a/kernel/cred.c b/kernel/cred.c index 12a7b1ce5131..3df4e15bd67f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -384,8 +384,9 @@ int commit_creds(struct cred *new) !uid_eq(old->fsuid, new->fsuid) || !gid_eq(old->fsgid, new->fsgid) || !cred_cap_issubset(old, new)) { + /* mm-less tasks share init_task's exec_state */ if (task->mm) - set_dumpable(task->mm, suid_dumpable); + task_exec_state_set_dumpable(suid_dumpable); task->pdeath_signal = 0; /* * If a task drops privileges and becomes nondumpable, diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 1a725edbbbf6..2c0e2cd89b5e 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1251,7 +1251,14 @@ void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - if (!(attrs & DMA_ATTR_MMIO)) { + if (attrs & DMA_ATTR_MMIO) { + unsigned long pfn = PHYS_PFN(phys); + + if (pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) + err_printk(dev, entry, + "dma_map_resource called for RAM address %pa\n", + &phys); + } else { check_for_stack(dev, phys); if (!PhysHighMem(phys)) @@ -1549,7 +1556,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, struct dma_debug_entry ref = { .type = dma_debug_sg, .dev = dev, - .paddr = sg_phys(sg), + .paddr = 
sg_phys(s), .dev_addr = sg_dma_address(s), .size = sg_dma_len(s), .direction = direction, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ec887f443741..4391b797d4db 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -39,7 +39,7 @@ static inline struct page *dma_direct_to_page(struct device *dev, u64 dma_direct_get_required_mask(struct device *dev) { - phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT; + phys_addr_t phys = ((phys_addr_t)max_pfn << PAGE_SHIFT) - 1; u64 max_dma = phys_to_dma_direct(dev, phys); return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; @@ -476,7 +476,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, * must be mapped with CPU physical address and not PCI * bus addresses. */ - break; + fallthrough; case PCI_P2PDMA_MAP_NONE: need_sync = true; sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), @@ -553,7 +553,7 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; + u64 min_mask = ((u64)max_pfn << PAGE_SHIFT) - 1; /* * Because 32-bit DMA masks are so common we expect every architecture diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 23ed8eb9233e..4eedb1a6273a 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -126,11 +126,9 @@ static bool dma_go_direct(struct device *dev, dma_addr_t mask, if (likely(!ops)) return true; -#ifdef CONFIG_DMA_OPS_BYPASS - if (dev->dma_ops_bypass) + if (IS_ENABLED(CONFIG_DMA_OPS_BYPASS) && dev_dma_ops_bypass(dev)) return min_not_zero(mask, dev->bus_dma_limit) >= dma_direct_get_required_mask(dev); -#endif return false; } @@ -365,10 +363,6 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && - WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) - return 
DMA_MAPPING_ERROR; - return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_map_resource); @@ -476,7 +470,7 @@ bool dma_need_unmap(struct device *dev) { if (!dma_map_direct(dev, get_dma_ops(dev))) return true; - if (!dev->dma_skip_sync) + if (!dev_dma_skip_sync(dev)) return true; return IS_ENABLED(CONFIG_DMA_API_DEBUG); } @@ -492,16 +486,16 @@ static void dma_setup_need_sync(struct device *dev) * mapping, if any. During the device initialization, it's * enough to check only for the DMA coherence. */ - dev->dma_skip_sync = dev_is_dma_coherent(dev); + dev_assign_dma_skip_sync(dev, dev_is_dma_coherent(dev)); else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu && !ops->sync_sg_for_device && !ops->sync_sg_for_cpu) /* * Synchronization is not possible when none of DMA sync ops * is set. */ - dev->dma_skip_sync = true; + dev_set_dma_skip_sync(dev); else - dev->dma_skip_sync = false; + dev_clear_dma_skip_sync(dev); } #else /* !CONFIG_DMA_NEED_SYNC */ static inline void dma_setup_need_sync(struct device *dev) { } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 19d2244a9fef..e3d381fd3d25 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -1,11 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/irq-entry-common.h> -#include <linux/resume_user_mode.h> +#include <linux/futex.h> #include <linux/highmem.h> +#include <linux/irq-entry-common.h> #include <linux/jump_label.h> #include <linux/kmsan.h> #include <linux/livepatch.h> +#include <linux/resume_user_mode.h> #include <linux/tick.h> /* Workaround to allow gradual conversion of architecture code */ @@ -60,8 +61,10 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re if (ti_work & _TIF_PATCH_PENDING) klp_update_patch_state(current); - if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { + futex_fixup_robust_unlock(regs); 
arch_do_signal_or_restart(regs); + } if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(regs); diff --git a/kernel/events/core.c b/kernel/events/core.c index 6d1f8bad7e1c..b1e1c5f0c7ba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -58,6 +58,7 @@ #include <linux/percpu-rwsem.h> #include <linux/unwind_deferred.h> #include <linux/kvm_types.h> +#include <linux/seq_file.h> #include "internal.h" @@ -7006,6 +7007,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) } static void perf_pmu_output_stop(struct perf_event *event); +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb); /* * A buffer can be mmap()ed multiple times; either directly through the same @@ -7021,8 +7023,6 @@ static void perf_mmap_close(struct vm_area_struct *vma) mapped_f unmapped = get_mapped(event, event_unmapped); struct perf_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; - int mmap_locked = rb->mmap_locked; - unsigned long size = perf_data_size(rb); bool detach_rest = false; /* FIXIES vs perf_pmu_unregister() */ @@ -7117,11 +7117,7 @@ again: * Aside from that, this buffer is 'fully' detached and unmapped, * undo the VM accounting. 
*/ - - atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, - &mmap_user->locked_vm); - atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); - free_uid(mmap_user); + perf_mmap_unaccount(vma, rb); out_put: ring_buffer_put(rb); /* could be last */ @@ -7261,6 +7257,15 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long atomic64_add(extra, &vma->vm_mm->pinned_vm); } +static void perf_mmap_unaccount(struct vm_area_struct *vma, struct perf_buffer *rb) +{ + struct user_struct *user = rb->mmap_user; + + atomic_long_sub((perf_data_size(rb) >> PAGE_SHIFT) + 1 - rb->mmap_locked, + &user->locked_vm); + atomic64_sub(rb->mmap_locked, &vma->vm_mm->pinned_vm); +} + static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, unsigned long nr_pages) { @@ -7323,8 +7328,6 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (!rb) return -ENOMEM; - refcount_set(&rb->mmap_count, 1); - rb->mmap_user = get_current_user(); rb->mmap_locked = extra; ring_buffer_attach(event, rb); @@ -7474,16 +7477,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) mapped(event, vma->vm_mm); /* - * Try to map it into the page table. On fail, invoke - * perf_mmap_close() to undo the above, as the callsite expects - * full cleanup in this case and therefore does not invoke - * vmops::close(). + * Try to map it into the page table. On fail undo the above, + * as the callsite expects full cleanup in this case and + * therefore does not invoke vmops::close(). */ ret = map_range(event->rb, vma); - if (ret) - perf_mmap_close(vma); + if (likely(!ret)) + return 0; + + /* Error path */ + + /* + * If this is the first mmap(), then event->mmap_count should + * be stable at 1. It is only modified by: + * perf_mmap_{open,close}() and perf_mmap(). 
+ * + * The former are not possible because this mmap() hasn't been + * successful yet, and the latter is serialized by + * event->mmap_mutex which we still hold (note that mmap_lock + * is not strictly sufficient here, because the event fd can + * be passed to another process through trivial means like + * fork(), leading to concurrent mmap() from different mm). + * + * Make sure to remove event->rb before releasing + * event->mmap_mutex, such that any concurrent mmap() will not + * attempt use this failed buffer. + */ + if (refcount_read(&event->mmap_count) == 1) { + /* + * Minimal perf_mmap_close(); there can't be AUX or + * other events on account of this being the first. + */ + mapped = get_mapped(event, event_unmapped); + if (mapped) + mapped(event, vma->vm_mm); + perf_mmap_unaccount(vma, event->rb); + ring_buffer_attach(event, NULL); /* drops last rb->refcount */ + refcount_set(&event->mmap_count, 0); + return ret; + } + + /* + * Otherwise this is an already existing buffer, and there is + * no race vs first exposure, so fall-through and call + * perf_mmap_close(). + */ } + perf_mmap_close(vma); return ret; } @@ -7506,6 +7547,33 @@ static int perf_fasync(int fd, struct file *filp, int on) return 0; } +static void perf_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct perf_event *event = f->private_data; + struct perf_event_context *ctx; + struct mutex *child_mutex; + + ctx = perf_event_ctx_lock(event); + child_mutex = event->parent ? 
&event->parent->child_mutex : &event->child_mutex; + mutex_lock(child_mutex); + + seq_printf(m, "perf_event_attr.type:\t%u\n", event->orig_type); + if (event->pmu) + seq_printf(m, "pmu_type:\t%u\n", event->pmu->type); + seq_printf(m, "perf_event_attr.config:\t0x%llx\n", (unsigned long long)event->attr.config); + seq_printf(m, "perf_event_attr.config1:\t0x%llx\n", + (unsigned long long)event->attr.config1); + seq_printf(m, "perf_event_attr.config2:\t0x%llx\n", + (unsigned long long)event->attr.config2); + seq_printf(m, "perf_event_attr.config3:\t0x%llx\n", + (unsigned long long)event->attr.config3); + seq_printf(m, "perf_event_attr.config4:\t0x%llx\n", + (unsigned long long)event->attr.config4); + + mutex_unlock(child_mutex); + perf_event_ctx_unlock(event, ctx); +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, @@ -7514,6 +7582,7 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, + .show_fdinfo = perf_show_fdinfo, }; /* @@ -11643,6 +11712,15 @@ static int __perf_event_set_bpf_prog(struct perf_event *event, /* only uprobe programs are allowed to be sleepable */ return -EINVAL; + if (prog->type == BPF_PROG_TYPE_TRACEPOINT && prog->sleepable) { + /* + * Sleepable tracepoint programs can only attach to faultable + * tracepoints. Currently only syscall tracepoints are faultable. + */ + if (!is_syscall_tp) + return -EINVAL; + } + /* Kprobe override only works for kprobes, not uprobes. 
*/ if (prog->kprobe_override && !is_kprobe) return -EINVAL; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d9cc57083091..c03c4f2eea57 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -67,6 +67,7 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head) struct perf_buffer *rb; rb = container_of(rcu_head, struct perf_buffer, rcu_head); + free_uid(rb->mmap_user); rb_free(rb); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 3e7de2661417..9fe92161715e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -340,6 +340,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) rb->paused = 1; mutex_init(&rb->aux_mutex); + rb->mmap_user = get_current_user(); + refcount_set(&rb->mmap_count, 1); } void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) diff --git a/kernel/exec_state.c b/kernel/exec_state.c new file mode 100644 index 000000000000..6034f4b4808f --- /dev/null +++ b/kernel/exec_state.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */ +#include <linux/init.h> +#include <linux/rcupdate.h> +#include <linux/refcount.h> +#include <linux/sched.h> +#include <linux/sched/coredump.h> +#include <linux/sched/exec_state.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/user_namespace.h> + +static struct kmem_cache *task_exec_state_cachep; + +static void __free_task_exec_state(struct rcu_head *rcu) +{ + struct task_exec_state *exec_state = container_of(rcu, struct task_exec_state, rcu); + + put_user_ns(exec_state->user_ns); + kmem_cache_free(task_exec_state_cachep, exec_state); +} + +void put_task_exec_state(struct task_exec_state *exec_state) +{ + if (exec_state && refcount_dec_and_test(&exec_state->count)) + call_rcu(&exec_state->rcu, __free_task_exec_state); +} + +struct task_exec_state *alloc_task_exec_state(struct user_namespace *user_ns) +{ 
+ struct task_exec_state *exec_state; + + exec_state = kmem_cache_alloc(task_exec_state_cachep, GFP_KERNEL); + if (!exec_state) + return NULL; + refcount_set(&exec_state->count, 1); + exec_state->dumpable = TASK_DUMPABLE_OFF; + exec_state->user_ns = get_user_ns(user_ns); + return exec_state; +} + +struct task_exec_state *task_exec_state_rcu(const struct task_struct *tsk) +{ + struct task_exec_state *exec_state; + + exec_state = rcu_dereference_check(tsk->exec_state, + lockdep_is_held(&tsk->alloc_lock)); + WARN_ON_ONCE(!exec_state); + return exec_state; +} + +struct task_exec_state *task_exec_state_replace(struct task_struct *tsk, + struct task_exec_state *exec_state) +{ + /* + * Updates must hold both locks so callers needing a consistent + * snapshot of mm + dumpability are covered. + */ + lockdep_assert_held(&tsk->alloc_lock); + lockdep_assert_held_write(&tsk->signal->exec_update_lock); + + return rcu_replace_pointer(tsk->exec_state, exec_state, true); +} + +/* + * The non-CLONE_VM clone path: allocate a fresh exec_state and + * inherit the parent's dumpable mode and user_ns reference. CLONE_VM + * siblings refcount-share via copy_exec_state() in fork.c; only this + * path and execve() ever allocate. + */ +int task_exec_state_copy(struct task_struct *tsk) +{ + struct task_exec_state *src, *dst; + + src = rcu_dereference_protected(current->exec_state, true); + dst = alloc_task_exec_state(src->user_ns); + if (!dst) + return -ENOMEM; + dst->dumpable = READ_ONCE(src->dumpable); + rcu_assign_pointer(tsk->exec_state, dst); + return 0; +} + +/* + * Store TASK_DUMPABLE_* on current->exec_state. All callers + * (commit_creds, begin_new_exec, prctl(PR_SET_DUMPABLE)) act on the + * running task, which guarantees ->exec_state is allocated and cannot + * be replaced under us. 
+ */ +void task_exec_state_set_dumpable(enum task_dumpable value) +{ + struct task_exec_state *exec_state; + + if (WARN_ON_ONCE(value > TASK_DUMPABLE_ROOT)) + value = TASK_DUMPABLE_OFF; + + exec_state = rcu_dereference_protected(current->exec_state, true); + /* mm-less tasks share init_task's exec_state; never mutate it */ + if (WARN_ON_ONCE(exec_state == &init_task_exec_state)) + return; + WRITE_ONCE(exec_state->dumpable, value); +} + +enum task_dumpable task_exec_state_get_dumpable(struct task_struct *task) +{ + struct task_exec_state *exec_state; + + guard(rcu)(); + exec_state = rcu_dereference(task->exec_state); + return READ_ONCE(exec_state->dumpable); +} + +void __init exec_state_init(void) +{ + task_exec_state_cachep = kmem_cache_create("task_exec_state", + sizeof(struct task_exec_state), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, + NULL); +} diff --git a/kernel/exit.c b/kernel/exit.c index 25e9cb6de7e7..1056422bc101 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -543,6 +543,32 @@ void mm_update_next_owner(struct mm_struct *mm) } #endif /* CONFIG_MEMCG */ +#if defined(CONFIG_SCHED_CACHE) && defined(CONFIG_NUMA_BALANCING) +/* + * Subtract the memory footprint of the current task from + * mm. + */ +static void exit_mm_sched_cache(struct mm_struct *mm) +{ + unsigned long fp, sub; + + if (!current->total_numa_faults) + return; + /* + * No lock protection due to performance considerations. + * Make sure mm->sc_stat.footprint does not become + * negative. + */ + fp = READ_ONCE(mm->sc_stat.footprint); + sub = min(fp, current->total_numa_faults); + WRITE_ONCE(mm->sc_stat.footprint, fp - sub); +} +#else +static inline void exit_mm_sched_cache(struct mm_struct *mm) +{ +} +#endif /* CONFIG_SCHED_CACHE CONFIG_NUMA_BALANCING */ + /* * Turn us into a lazy TLB process if we * aren't already.. 
@@ -554,6 +580,9 @@ static void exit_mm(void) exit_mm_release(current, mm); if (!mm) return; + + exit_mm_sched_cache(mm); + mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); @@ -988,8 +1017,8 @@ void __noreturn do_exit(long code) proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX - if (unlikely(current->pi_state_cache)) - kfree(current->pi_state_cache); + if (unlikely(current->futex.pi_state_cache)) + kfree(current->futex.pi_state_cache); #endif /* * Make sure we are holding no locks: @@ -1073,6 +1102,7 @@ void __noreturn make_task_dead(int signr) futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); + preempt_disable(); do_task_dead(); } diff --git a/kernel/fork.c b/kernel/fork.c index 5f3fdfdb14c7..addc555a1077 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/sched/ext.h> +#include <linux/sched/exec_state.h> #include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> @@ -555,6 +556,7 @@ void free_task(struct task_struct *tsk) if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); bpf_task_storage_free(tsk); + put_task_exec_state(rcu_access_pointer(tsk->exec_state)); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -726,12 +728,12 @@ void __mmdrop(struct mm_struct *mm) cleanup_lazy_tlbs(mm); WARN_ON_ONCE(mm == current->active_mm); + mm_destroy_sched(mm); mm_free_pgd(mm); mm_free_id(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); - put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); @@ -946,6 +948,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->seccomp.filter = NULL; #endif + RCU_INIT_POINTER(tsk->exec_state, NULL); + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -1072,8 
+1076,7 @@ static void mmap_init_lock(struct mm_struct *mm) #endif } -static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, - struct user_namespace *user_ns) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); @@ -1101,6 +1104,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, #endif mm_init_uprobes_state(mm); hugetlb_count_init(mm); + futex_mm_init(mm); mm_flags_clear_all(mm); if (current->mm) { @@ -1113,11 +1117,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->def_flags = 0; } - if (futex_mm_init(mm)) - goto fail_mm_init; - if (mm_alloc_pgd(mm)) - goto fail_nopgd; + goto fail_mm_init; if (mm_alloc_id(mm)) goto fail_noid; @@ -1128,15 +1129,19 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_cid(mm, p)) goto fail_cid; + if (mm_alloc_sched(mm)) + goto fail_sched; + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS)) goto fail_pcpu; - mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; fail_pcpu: + mm_destroy_sched(mm); +fail_sched: mm_destroy_cid(mm); fail_cid: destroy_context(mm); @@ -1144,8 +1149,6 @@ fail_nocontext: mm_free_id(mm); fail_noid: mm_free_pgd(mm); -fail_nopgd: - futex_hash_free(mm); fail_mm_init: free_mm(mm); return NULL; @@ -1163,7 +1166,7 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - return mm_init(mm, current, current_user_ns()); + return mm_init(mm, current); } EXPORT_SYMBOL_IF_KUNIT(mm_alloc); @@ -1527,7 +1530,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, memcpy(mm, oldmm, sizeof(*mm)); - if (!mm_init(mm, tsk, mm->user_ns)) + if (!mm_init(mm, tsk)) goto fail_nomem; uprobe_start_dup_mmap(); @@ -1593,6 +1596,22 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk) return 0; } +static int 
copy_exec_state(u64 clone_flags, struct task_struct *tsk) +{ + struct task_exec_state *exec_state; + + /* CLONE_VM siblings refcount-share the parent's exec_state. */ + if (clone_flags & CLONE_VM) { + exec_state = rcu_dereference_protected(current->exec_state, true); + refcount_inc(&exec_state->count); + rcu_assign_pointer(tsk->exec_state, exec_state); + return 0; + } + + /* Everyone else inherits a fresh copy. */ + return task_exec_state_copy(tsk); +} + static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; @@ -2090,6 +2109,9 @@ __latent_entropy struct task_struct *copy_process( p = dup_task_struct(current, node); if (!p) goto fork_out; + retval = copy_exec_state(clone_flags, p); + if (retval) + goto bad_fork_free; p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; @@ -2218,6 +2240,7 @@ __latent_entropy struct task_struct *copy_process( lockdep_init_task(p); p->blocked_on = NULL; /* not blocked yet */ + p->blocked_donor = NULL; /* nobody is boosting p yet */ #ifdef CONFIG_BCACHE p->sequential_io = 0; @@ -2664,8 +2687,6 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. - * - * args->exit_signal is expected to be checked for sanity by the caller. */ pid_t kernel_clone(struct kernel_clone_args *args) { @@ -2700,6 +2721,9 @@ pid_t kernel_clone(struct kernel_clone_args *args) (args->pidfd == args->parent_tid)) return -EINVAL; + if (!valid_signal(args->exit_signal)) + return -EINVAL; + /* * Determine whether and which event to report to ptracer. 
When * called from kernel_thread or CLONE_UNTRACED is explicitly @@ -2898,11 +2922,9 @@ static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs, return -EINVAL; /* - * Verify that higher 32bits of exit_signal are unset and that - * it is a valid signal + * Verify that higher 32bits of exit_signal are unset */ - if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || - !valid_signal(args.exit_signal))) + if (unlikely(args.exit_signal & ~((u64)CSIGNAL))) return -EINVAL; if ((args.flags & CLONE_INTO_CGROUP) && @@ -3098,6 +3120,7 @@ void __init proc_caches_init(void) sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); + exec_state_init(); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, diff --git a/kernel/futex/core.c b/kernel/futex/core.c index ff2a4fb2993f..179b26e9c934 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -32,18 +32,21 @@ * "But they come in a choice of three flavours!" 
*/ #include <linux/compat.h> -#include <linux/jhash.h> -#include <linux/pagemap.h> #include <linux/debugfs.h> -#include <linux/plist.h> +#include <linux/fault-inject.h> #include <linux/gfp.h> -#include <linux/vmalloc.h> +#include <linux/jhash.h> #include <linux/memblock.h> -#include <linux/fault-inject.h> -#include <linux/slab.h> -#include <linux/prctl.h> #include <linux/mempolicy.h> #include <linux/mmap_lock.h> +#include <linux/pagemap.h> +#include <linux/plist.h> +#include <linux/prctl.h> +#include <linux/rseq.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <vdso/futex.h> #include "futex.h" #include "../locking/rtmutex_common.h" @@ -124,7 +127,7 @@ late_initcall(fail_futex_debugfs); #endif /* CONFIG_FAIL_FUTEX */ static struct futex_hash_bucket * -__futex_hash(union futex_key *key, struct futex_private_hash *fph); +__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p); #ifdef CONFIG_FUTEX_PRIVATE_HASH static bool futex_ref_get(struct futex_private_hash *fph); @@ -133,15 +136,6 @@ static bool futex_ref_is_dead(struct futex_private_hash *fph); enum { FR_PERCPU = 0, FR_ATOMIC }; -static inline bool futex_key_is_private(union futex_key *key) -{ - /* - * Relies on get_futex_key() to set either bit for shared - * futexes -- see comment with union futex_key. - */ - return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); -} - static bool futex_private_hash_get(struct futex_private_hash *fph) { return futex_ref_get(fph); @@ -149,51 +143,18 @@ static bool futex_private_hash_get(struct futex_private_hash *fph) void futex_private_hash_put(struct futex_private_hash *fph) { - if (futex_ref_put(fph)) + if (fph && futex_ref_put(fph)) wake_up_var(fph->mm); } -/** - * futex_hash_get - Get an additional reference for the local hash. - * @hb: ptr to the private local hash. - * - * Obtain an additional reference for the already obtained hash bucket. The - * caller must already own an reference. 
- */ -void futex_hash_get(struct futex_hash_bucket *hb) -{ - struct futex_private_hash *fph = hb->priv; - - if (!fph) - return; - WARN_ON_ONCE(!futex_private_hash_get(fph)); -} - -void futex_hash_put(struct futex_hash_bucket *hb) -{ - struct futex_private_hash *fph = hb->priv; - - if (!fph) - return; - futex_private_hash_put(fph); -} - static struct futex_hash_bucket * __futex_hash_private(union futex_key *key, struct futex_private_hash *fph) { u32 hash; - if (!futex_key_is_private(key)) - return NULL; - - if (!fph) - fph = rcu_dereference(key->private.mm->futex_phash); - if (!fph || !fph->hash_mask) - return NULL; - - hash = jhash2((void *)&key->private.address, - sizeof(key->private.address) / 4, + hash = jhash2((void *)&key->private.address, sizeof(key->private.address) / 4, key->both.offset); + return &fph->queues[hash & fph->hash_mask]; } @@ -211,13 +172,12 @@ static void futex_rehash_private(struct futex_private_hash *old, spin_lock(&hb_old->lock); plist_for_each_entry_safe(this, tmp, &hb_old->chain, list) { - plist_del(&this->list, &hb_old->chain); futex_hb_waiters_dec(hb_old); WARN_ON_ONCE(this->lock_ptr != &hb_old->lock); - hb_new = __futex_hash(&this->key, new); + hb_new = __futex_hash(&this->key, new, NULL); futex_hb_waiters_inc(hb_new); /* * The new pointer isn't published yet but an already @@ -232,18 +192,17 @@ static void futex_rehash_private(struct futex_private_hash *old, } } -static bool __futex_pivot_hash(struct mm_struct *mm, - struct futex_private_hash *new) +static bool __futex_pivot_hash(struct mm_struct *mm, struct futex_private_hash *new) { + struct futex_mm_phash *mmph = &mm->futex.phash; struct futex_private_hash *fph; - WARN_ON_ONCE(mm->futex_phash_new); + WARN_ON_ONCE(mmph->hash_new); - fph = rcu_dereference_protected(mm->futex_phash, - lockdep_is_held(&mm->futex_hash_lock)); + fph = rcu_dereference_protected(mmph->hash, lockdep_is_held(&mmph->lock)); if (fph) { if (!futex_ref_is_dead(fph)) { - mm->futex_phash_new = new; + 
mmph->hash_new = new; return false; } @@ -251,8 +210,8 @@ static bool __futex_pivot_hash(struct mm_struct *mm, } new->state = FR_PERCPU; scoped_guard(rcu) { - mm->futex_batches = get_state_synchronize_rcu(); - rcu_assign_pointer(mm->futex_phash, new); + mmph->batches = get_state_synchronize_rcu(); + rcu_assign_pointer(mmph->hash, new); } kvfree_rcu(fph, rcu); return true; @@ -260,20 +219,19 @@ static bool __futex_pivot_hash(struct mm_struct *mm, static void futex_pivot_hash(struct mm_struct *mm) { - scoped_guard(mutex, &mm->futex_hash_lock) { + scoped_guard(mutex, &mm->futex.phash.lock) { struct futex_private_hash *fph; - fph = mm->futex_phash_new; + fph = mm->futex.phash.hash_new; if (fph) { - mm->futex_phash_new = NULL; + mm->futex.phash.hash_new = NULL; __futex_pivot_hash(mm, fph); } } } -struct futex_private_hash *futex_private_hash(void) +struct futex_private_hash *futex_private_hash(struct mm_struct *mm) { - struct mm_struct *mm = current->mm; /* * Ideally we don't loop. If there is a replacement in progress * then a new private hash is already prepared and a reference can't be @@ -288,7 +246,7 @@ again: scoped_guard(rcu) { struct futex_private_hash *fph; - fph = rcu_dereference(mm->futex_phash); + fph = rcu_dereference(mm->futex.phash.hash); if (!fph) return NULL; @@ -299,18 +257,17 @@ again: goto again; } -struct futex_hash_bucket *futex_hash(union futex_key *key) +struct futex_bucket_ref futex_hash(union futex_key *key) { - struct futex_private_hash *fph; - struct futex_hash_bucket *hb; - again: scoped_guard(rcu) { - hb = __futex_hash(key, NULL); - fph = hb->priv; + struct futex_private_hash *fph = NULL; + struct futex_hash_bucket *hb; + + hb = __futex_hash(key, NULL, &fph); if (!fph || futex_private_hash_get(fph)) - return hb; + return (struct futex_bucket_ref){ .hb = hb, .fph = fph }; } futex_pivot_hash(key->private.mm); goto again; @@ -318,15 +275,9 @@ again: #else /* !CONFIG_FUTEX_PRIVATE_HASH */ -static struct futex_hash_bucket * 
-__futex_hash_private(union futex_key *key, struct futex_private_hash *fph) +struct futex_bucket_ref futex_hash(union futex_key *key) { - return NULL; -} - -struct futex_hash_bucket *futex_hash(union futex_key *key) -{ - return __futex_hash(key, NULL); + return (struct futex_bucket_ref){ .hb = __futex_hash(key, NULL, NULL), .fph = NULL }; } #endif /* CONFIG_FUTEX_PRIVATE_HASH */ @@ -404,6 +355,8 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr) * __futex_hash - Return the hash bucket * @key: Pointer to the futex key for which the hash is calculated * @fph: Pointer to private hash if known + * @fph_p: Pointer to a private hash pointer; output for the private hash + * used when set. * * We hash on the keys returned from get_futex_key (see below) and return the * corresponding hash bucket. @@ -412,21 +365,24 @@ static int futex_mpol(struct mm_struct *mm, unsigned long addr) * global hash is returned. */ static struct futex_hash_bucket * -__futex_hash(union futex_key *key, struct futex_private_hash *fph) +__futex_hash(union futex_key *key, struct futex_private_hash *fph, struct futex_private_hash **fph_p) { int node = key->both.node; u32 hash; - if (node == FUTEX_NO_NODE) { - struct futex_hash_bucket *hb; - - hb = __futex_hash_private(key, fph); - if (hb) - return hb; +#ifdef CONFIG_FUTEX_PRIVATE_HASH + if (node == FUTEX_NO_NODE && futex_key_is_private(key)) { + if (!fph) + fph = rcu_dereference(key->private.mm->futex.phash.hash); + if (fph && fph->hash_mask) { + if (fph_p) + *fph_p = fph; + return __futex_hash_private(key, fph); + } } +#endif - hash = jhash2((u32 *)key, - offsetof(typeof(*key), both.offset) / sizeof(u32), + hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / sizeof(u32), key->both.offset); if (node == FUTEX_NO_NODE) { @@ -441,8 +397,7 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph) */ node = (hash >> futex_hashshift) % nr_node_ids; if (!node_possible(node)) { - node = 
find_next_bit_wrap(node_possible_map.bits, - nr_node_ids, node); + node = find_next_bit_wrap(node_possible_map.bits, nr_node_ids, node); } } @@ -459,9 +414,8 @@ __futex_hash(union futex_key *key, struct futex_private_hash *fph) * Return: Initialized hrtimer_sleeper structure or NULL if no timeout * value given */ -struct hrtimer_sleeper * -futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, - int flags, u64 range_ns) +struct hrtimer_sleeper *futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, + int flags, u64 range_ns) { if (!time) return NULL; @@ -829,7 +783,7 @@ void wait_for_owner_exiting(int ret, struct task_struct *exiting) if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) return; - mutex_lock(&exiting->futex_exit_mutex); + mutex_lock(&exiting->futex.exit_mutex); /* * No point in doing state checking here. If the waiter got here * while the task was in exec()->exec_futex_release() then it can @@ -838,7 +792,7 @@ void wait_for_owner_exiting(int ret, struct task_struct *exiting) * already. Highly unlikely and not a problem. Just one more round * through the futex maze. 
*/ - mutex_unlock(&exiting->futex_exit_mutex); + mutex_unlock(&exiting->futex.exit_mutex); put_task_struct(exiting); } @@ -1012,8 +966,9 @@ void futex_unqueue_pi(struct futex_q *q) * dying task, and do notification if so: */ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, - bool pi, bool pending_op) + unsigned int mod, bool pending_op) { + bool pi = !!(mod & FUTEX_ROBUST_MOD_PI); u32 uval, nval, mval; pid_t owner; int err; @@ -1047,7 +1002,7 @@ retry: * * In both cases the following conditions are met: * - * 1) task->robust_list->list_op_pending != NULL + * 1) task->futex.robust_list->list_op_pending != NULL * @pending_op == true * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false @@ -1065,7 +1020,7 @@ retry: owner = uval & FUTEX_TID_MASK; if (pending_op && !pi && !owner) { - futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, NULL, 1, FUTEX_BITSET_MATCH_ANY); return 0; } @@ -1119,7 +1074,7 @@ retry: * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) { - futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1, + futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, NULL, 1, FUTEX_BITSET_MATCH_ANY); } @@ -1131,31 +1086,30 @@ retry: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, - unsigned int *pi) + unsigned int *mod) { unsigned long uentry; if (get_user(uentry, (unsigned long __user *)head)) return -EFAULT; - *entry = (void __user *)(uentry & ~1UL); - *pi = uentry & 1; + *entry = (void __user *)(uentry & ~FUTEX_ROBUST_MOD_MASK); + *mod = uentry & FUTEX_ROBUST_MOD_MASK; return 0; } /* - * Walk curr->robust_list (very carefully, it's a userspace list!) + * Walk curr->futex.robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. 
*/ static void exit_robust_list(struct task_struct *curr) { - struct robust_list_head __user *head = curr->robust_list; + struct robust_list_head __user *head = curr->futex.robust_list; + unsigned int limit = ROBUST_LIST_LIMIT, cur_mod, next_mod, pend_mod; struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int next_pi; unsigned long futex_offset; int rc; @@ -1163,7 +1117,7 @@ static void exit_robust_list(struct task_struct *curr) * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (fetch_robust_entry(&entry, &head->list.next, &pi)) + if (fetch_robust_entry(&entry, &head->list.next, &cur_mod)) return; /* * Fetch the relative futex offset: @@ -1174,7 +1128,7 @@ static void exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) + if (fetch_robust_entry(&pending, &head->list_op_pending, &pend_mod)) return; next_entry = NULL; /* avoid warning with gcc */ @@ -1183,20 +1137,20 @@ static void exit_robust_list(struct task_struct *curr) * Fetch the next entry in the list before calling * handle_futex_death: */ - rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); + rc = fetch_robust_entry(&next_entry, &entry->next, &next_mod); /* * A pending lock might already be on the list, so * don't process it twice: */ if (entry != pending) { if (handle_futex_death((void __user *)entry + futex_offset, - curr, pi, HANDLE_DEATH_LIST)) + curr, cur_mod, HANDLE_DEATH_LIST)) return; } if (rc) return; entry = next_entry; - pi = next_pi; + cur_mod = next_mod; /* * Avoid excessively long or circular lists: */ @@ -1208,10 +1162,31 @@ static void exit_robust_list(struct task_struct *curr) if (pending) { handle_futex_death((void __user *)pending + futex_offset, - curr, pip, HANDLE_DEATH_PENDING); + curr, pend_mod, HANDLE_DEATH_PENDING); } } +static bool 
robust_list_clear_pending(unsigned long __user *pop) +{ + struct robust_list_head __user *head = current->futex.robust_list; + + if (!put_user(0UL, pop)) + return true; + + /* + * Just give up. The robust list head is usually part of TLS, so the + * chance that this gets resolved is close to zero. + * + * If @pop is the robust_list_head::list_op_pending pointer then + * clear the robust list head pointer to prevent further damage when the + * task exits. Better a few stale futexes than corrupted memory. But + * that's mostly an academic exercise. + */ + if (pop == (unsigned long __user *)&head->list_op_pending) + current->futex.robust_list = NULL; + return false; +} + #ifdef CONFIG_COMPAT static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) @@ -1227,29 +1202,28 @@ static void __user *futex_uaddr(struct robust_list __user *entry, */ static inline int compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, unsigned int *pi) + compat_uptr_t __user *head, unsigned int *pflags) { if (get_user(*uentry, head)) return -EFAULT; - *entry = compat_ptr((*uentry) & ~1); - *pi = (unsigned int)(*uentry) & 1; + *entry = compat_ptr((*uentry) & ~FUTEX_ROBUST_MOD_MASK); + *pflags = (unsigned int)(*uentry) & FUTEX_ROBUST_MOD_MASK; return 0; } /* - * Walk curr->robust_list (very carefully, it's a userspace list!) + * Walk curr->futex.robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * * We silently return on any sign of list-walking problem. 
*/ static void compat_exit_robust_list(struct task_struct *curr) { - struct compat_robust_list_head __user *head = curr->compat_robust_list; + struct compat_robust_list_head __user *head = curr->futex.compat_robust_list; + unsigned int limit = ROBUST_LIST_LIMIT, cur_mod, next_mod, pend_mod; struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - unsigned int next_pi; compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; @@ -1258,7 +1232,7 @@ static void compat_exit_robust_list(struct task_struct *curr) * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) + if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &cur_mod)) return; /* * Fetch the relative futex offset: @@ -1269,8 +1243,7 @@ static void compat_exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (compat_fetch_robust_entry(&upending, &pending, - &head->list_op_pending, &pip)) + if (compat_fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pend_mod)) return; next_entry = NULL; /* avoid warning with gcc */ @@ -1280,7 +1253,7 @@ static void compat_exit_robust_list(struct task_struct *curr) * handle_futex_death: */ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, - (compat_uptr_t __user *)&entry->next, &next_pi); + (compat_uptr_t __user *)&entry->next, &next_mod); /* * A pending lock might already be on the list, so * dont process it twice: @@ -1288,15 +1261,14 @@ static void compat_exit_robust_list(struct task_struct *curr) if (entry != pending) { void __user *uaddr = futex_uaddr(entry, futex_offset); - if (handle_futex_death(uaddr, curr, pi, - HANDLE_DEATH_LIST)) + if (handle_futex_death(uaddr, curr, cur_mod, HANDLE_DEATH_LIST)) return; } if (rc) return; uentry = next_uentry; entry = next_entry; - pi = next_pi; + 
cur_mod = next_mod; /* * Avoid excessively long or circular lists: */ @@ -1308,9 +1280,24 @@ static void compat_exit_robust_list(struct task_struct *curr) if (pending) { void __user *uaddr = futex_uaddr(pending, futex_offset); - handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); + handle_futex_death(uaddr, curr, pend_mod, HANDLE_DEATH_PENDING); } } + +static bool compat_robust_list_clear_pending(u32 __user *pop) +{ + struct compat_robust_list_head __user *head = current->futex.compat_robust_list; + + if (!put_user(0U, pop)) + return true; + + /* See comment in robust_list_clear_pending(). */ + if (pop == &head->list_op_pending) + current->futex.compat_robust_list = NULL; + return false; +} +#else +static bool compat_robust_list_clear_pending(u32 __user *pop_addr) { return false; } #endif #ifdef CONFIG_FUTEX_PI @@ -1322,7 +1309,7 @@ static void compat_exit_robust_list(struct task_struct *curr) */ static void exit_pi_state_list(struct task_struct *curr) { - struct list_head *next, *head = &curr->pi_state_list; + struct list_head *next, *head = &curr->futex.pi_state_list; struct futex_pi_state *pi_state; union futex_key key = FUTEX_KEY_INIT; @@ -1336,7 +1323,7 @@ static void exit_pi_state_list(struct task_struct *curr) * on the mutex. 
*/ WARN_ON(curr != current); - guard(private_hash)(); + guard(private_hash)(current->mm); /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -1348,7 +1335,8 @@ static void exit_pi_state_list(struct task_struct *curr) pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; if (1) { - CLASS(hb, hb)(&key); + CLASS(hbr, hbr)(&key); + auto hb = hbr.hb; /* * We can race against put_pi_state() removing itself from the @@ -1404,21 +1392,50 @@ static void exit_pi_state_list(struct task_struct *curr) static inline void exit_pi_state_list(struct task_struct *curr) { } #endif +bool futex_robust_list_clear_pending(void __user *pop, unsigned int flags) +{ + bool size32bit = !!(flags & FLAGS_ROBUST_LIST32); + + if (!IS_ENABLED(CONFIG_64BIT) && !size32bit) + return false; + + if (IS_ENABLED(CONFIG_64BIT) && size32bit) + return compat_robust_list_clear_pending(pop); + + return robust_list_clear_pending(pop); +} + +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK +void __futex_fixup_robust_unlock(struct pt_regs *regs, struct futex_unlock_cs_range *csr) +{ + /* + * arch_futex_robust_unlock_get_pop() returns the list pending op pointer from + * @regs if the try_cmpxchg() succeeded. + */ + void __user *pop = arch_futex_robust_unlock_get_pop(regs); + + if (!pop) + return; + + futex_robust_list_clear_pending(pop, csr->pop_size32 ? 
FLAGS_ROBUST_LIST32 : 0); +} +#endif /* CONFIG_FUTEX_ROBUST_UNLOCK */ + static void futex_cleanup(struct task_struct *tsk) { - if (unlikely(tsk->robust_list)) { + if (unlikely(tsk->futex.robust_list)) { exit_robust_list(tsk); - tsk->robust_list = NULL; + tsk->futex.robust_list = NULL; } #ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) { + if (unlikely(tsk->futex.compat_robust_list)) { compat_exit_robust_list(tsk); - tsk->compat_robust_list = NULL; + tsk->futex.compat_robust_list = NULL; } #endif - if (unlikely(!list_empty(&tsk->pi_state_list))) + if (unlikely(!list_empty(&tsk->futex.pi_state_list))) exit_pi_state_list(tsk); } @@ -1442,23 +1459,23 @@ static void futex_cleanup(struct task_struct *tsk) void futex_exit_recursive(struct task_struct *tsk) { /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ - if (tsk->futex_state == FUTEX_STATE_EXITING) { - __assume_ctx_lock(&tsk->futex_exit_mutex); - mutex_unlock(&tsk->futex_exit_mutex); + if (tsk->futex.state == FUTEX_STATE_EXITING) { + __assume_ctx_lock(&tsk->futex.exit_mutex); + mutex_unlock(&tsk->futex.exit_mutex); } - tsk->futex_state = FUTEX_STATE_DEAD; + tsk->futex.state = FUTEX_STATE_DEAD; } static void futex_cleanup_begin(struct task_struct *tsk) - __acquires(&tsk->futex_exit_mutex) + __acquires(&tsk->futex.exit_mutex) { /* * Prevent various race issues against a concurrent incoming waiter * including live locks by forcing the waiter to block on - * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in + * tsk->futex.exit_mutex when it observes FUTEX_STATE_EXITING in * attach_to_pi_owner(). */ - mutex_lock(&tsk->futex_exit_mutex); + mutex_lock(&tsk->futex.exit_mutex); /* * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. @@ -1472,23 +1489,23 @@ static void futex_cleanup_begin(struct task_struct *tsk) * be observed in exit_pi_state_list(). 
*/ raw_spin_lock_irq(&tsk->pi_lock); - tsk->futex_state = FUTEX_STATE_EXITING; + tsk->futex.state = FUTEX_STATE_EXITING; raw_spin_unlock_irq(&tsk->pi_lock); } static void futex_cleanup_end(struct task_struct *tsk, int state) - __releases(&tsk->futex_exit_mutex) + __releases(&tsk->futex.exit_mutex) { /* * Lockless store. The only side effect is that an observer might * take another loop until it becomes visible. */ - tsk->futex_state = state; + tsk->futex.state = state; /* * Drop the exit protection. This unblocks waiters which observed * FUTEX_STATE_EXITING to reevaluate the state. */ - mutex_unlock(&tsk->futex_exit_mutex); + mutex_unlock(&tsk->futex.exit_mutex); } void futex_exec_release(struct task_struct *tsk) @@ -1516,12 +1533,8 @@ void futex_exit_release(struct task_struct *tsk) futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } -static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, - struct futex_private_hash *fph) +static void futex_hash_bucket_init(struct futex_hash_bucket *fhb) { -#ifdef CONFIG_FUTEX_PRIVATE_HASH - fhb->priv = fph; -#endif atomic_set(&fhb->waiters, 0); plist_head_init(&fhb->chain); spin_lock_init(&fhb->lock); @@ -1553,17 +1566,17 @@ static void __futex_ref_atomic_begin(struct futex_private_hash *fph) * otherwise it would be impossible for it to have reported success * from futex_ref_is_dead(). */ - WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); + WARN_ON_ONCE(atomic_long_read(&mm->futex.phash.atomic) != 0); /* * Set the atomic to the bias value such that futex_ref_{get,put}() * will never observe 0. Will be fixed up in __futex_ref_atomic_end() * when folding in the percpu count. 
*/ - atomic_long_set(&mm->futex_atomic, LONG_MAX); + atomic_long_set(&mm->futex.phash.atomic, LONG_MAX); smp_store_release(&fph->state, FR_ATOMIC); - call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); + call_rcu_hurry(&mm->futex.phash.rcu, futex_ref_rcu); } static void __futex_ref_atomic_end(struct futex_private_hash *fph) @@ -1584,7 +1597,7 @@ static void __futex_ref_atomic_end(struct futex_private_hash *fph) * Therefore the per-cpu counter is now stable, sum and reset. */ for_each_possible_cpu(cpu) { - unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); + unsigned int *ptr = per_cpu_ptr(mm->futex.phash.ref, cpu); count += *ptr; *ptr = 0; } @@ -1592,7 +1605,7 @@ static void __futex_ref_atomic_end(struct futex_private_hash *fph) /* * Re-init for the next cycle. */ - this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + this_cpu_inc(*mm->futex.phash.ref); /* 0 -> 1 */ /* * Add actual count, subtract bias and initial refcount. @@ -1600,7 +1613,7 @@ static void __futex_ref_atomic_end(struct futex_private_hash *fph) * The moment this atomic operation happens, futex_ref_is_dead() can * become true. 
*/ - ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); + ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex.phash.atomic); if (!ret) wake_up_var(mm); @@ -1610,8 +1623,8 @@ static void __futex_ref_atomic_end(struct futex_private_hash *fph) static void futex_ref_rcu(struct rcu_head *head) { - struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); - struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); + struct mm_struct *mm = container_of(head, struct mm_struct, futex.phash.rcu); + struct futex_private_hash *fph = rcu_dereference_raw(mm->futex.phash.hash); if (fph->state == FR_PERCPU) { /* @@ -1640,7 +1653,7 @@ static void futex_ref_drop(struct futex_private_hash *fph) /* * Can only transition the current fph; */ - WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); + WARN_ON_ONCE(rcu_dereference_raw(mm->futex.phash.hash) != fph); /* * We enqueue at least one RCU callback. Ensure mm stays if the task * exits before the transition is completed. @@ -1651,9 +1664,9 @@ static void futex_ref_drop(struct futex_private_hash *fph) * In order to avoid the following scenario: * * futex_hash() __futex_pivot_hash() - * guard(rcu); guard(mm->futex_hash_lock); - * fph = mm->futex_phash; - * rcu_assign_pointer(&mm->futex_phash, new); + * guard(rcu); guard(mm->futex.phash.lock); + * fph = mm->futex.phash.hash; + * rcu_assign_pointer(&mm->futex.phash.hash, new); * futex_hash_allocate() * futex_ref_drop() * fph->state = FR_ATOMIC; @@ -1668,7 +1681,7 @@ static void futex_ref_drop(struct futex_private_hash *fph) * There must be at least one full grace-period between publishing a * new fph and trying to replace it. */ - if (poll_state_synchronize_rcu(mm->futex_batches)) { + if (poll_state_synchronize_rcu(mm->futex.phash.batches)) { /* * There was a grace-period, we can begin now. 
*/ @@ -1676,7 +1689,7 @@ static void futex_ref_drop(struct futex_private_hash *fph) return; } - call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); + call_rcu_hurry(&mm->futex.phash.rcu, futex_ref_rcu); } static bool futex_ref_get(struct futex_private_hash *fph) @@ -1686,11 +1699,11 @@ static bool futex_ref_get(struct futex_private_hash *fph) guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { - __this_cpu_inc(*mm->futex_ref); + __this_cpu_inc(*mm->futex.phash.ref); return true; } - return atomic_long_inc_not_zero(&mm->futex_atomic); + return atomic_long_inc_not_zero(&mm->futex.phash.atomic); } static bool futex_ref_put(struct futex_private_hash *fph) @@ -1700,11 +1713,11 @@ static bool futex_ref_put(struct futex_private_hash *fph) guard(preempt)(); if (READ_ONCE(fph->state) == FR_PERCPU) { - __this_cpu_dec(*mm->futex_ref); + __this_cpu_dec(*mm->futex.phash.ref); return false; } - return atomic_long_dec_and_test(&mm->futex_atomic); + return atomic_long_dec_and_test(&mm->futex.phash.atomic); } static bool futex_ref_is_dead(struct futex_private_hash *fph) @@ -1716,28 +1729,23 @@ static bool futex_ref_is_dead(struct futex_private_hash *fph) if (smp_load_acquire(&fph->state) == FR_PERCPU) return false; - return atomic_long_read(&mm->futex_atomic) == 0; + return atomic_long_read(&mm->futex.phash.atomic) == 0; } -int futex_mm_init(struct mm_struct *mm) +static void futex_hash_init_mm(struct futex_mm_data *fd) { - mutex_init(&mm->futex_hash_lock); - RCU_INIT_POINTER(mm->futex_phash, NULL); - mm->futex_phash_new = NULL; - /* futex-ref */ - mm->futex_ref = NULL; - atomic_long_set(&mm->futex_atomic, 0); - mm->futex_batches = get_state_synchronize_rcu(); - return 0; + memset(&fd->phash, 0, sizeof(fd->phash)); + mutex_init(&fd->phash.lock); + fd->phash.batches = get_state_synchronize_rcu(); } void futex_hash_free(struct mm_struct *mm) { struct futex_private_hash *fph; - free_percpu(mm->futex_ref); - kvfree(mm->futex_phash_new); - fph = rcu_dereference_raw(mm->futex_phash); 
+ free_percpu(mm->futex.phash.ref); + kvfree(mm->futex.phash.hash_new); + fph = rcu_dereference_raw(mm->futex.phash.hash); if (fph) kvfree(fph); } @@ -1748,10 +1756,10 @@ static bool futex_pivot_pending(struct mm_struct *mm) guard(rcu)(); - if (!mm->futex_phash_new) + if (!mm->futex.phash.hash_new) return true; - fph = rcu_dereference(mm->futex_phash); + fph = rcu_dereference(mm->futex.phash.hash); return futex_ref_is_dead(fph); } @@ -1793,7 +1801,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) * Once we've disabled the global hash there is no way back. */ scoped_guard(rcu) { - fph = rcu_dereference(mm->futex_phash); + fph = rcu_dereference(mm->futex.phash.hash); if (fph && !fph->hash_mask) { if (custom) return -EBUSY; @@ -1801,15 +1809,15 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) } } - if (!mm->futex_ref) { + if (!mm->futex.phash.ref) { /* * This will always be allocated by the first thread and * therefore requires no locking. 
*/ - mm->futex_ref = alloc_percpu(unsigned int); - if (!mm->futex_ref) + mm->futex.phash.ref = alloc_percpu(unsigned int); + if (!mm->futex.phash.ref) return -ENOMEM; - this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + this_cpu_inc(*mm->futex.phash.ref); /* 0 -> 1 */ } fph = kvzalloc(struct_size(fph, queues, hash_slots), @@ -1822,7 +1830,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) fph->mm = mm; for (i = 0; i < hash_slots; i++) - futex_hash_bucket_init(&fph->queues[i], fph); + futex_hash_bucket_init(&fph->queues[i]); if (custom) { /* @@ -1832,14 +1840,14 @@ again: wait_var_event(mm, futex_pivot_pending(mm)); } - scoped_guard(mutex, &mm->futex_hash_lock) { + scoped_guard(mutex, &mm->futex.phash.lock) { struct futex_private_hash *free __free(kvfree) = NULL; struct futex_private_hash *cur, *new; - cur = rcu_dereference_protected(mm->futex_phash, - lockdep_is_held(&mm->futex_hash_lock)); - new = mm->futex_phash_new; - mm->futex_phash_new = NULL; + cur = rcu_dereference_protected(mm->futex.phash.hash, + lockdep_is_held(&mm->futex.phash.lock)); + new = mm->futex.phash.hash_new; + mm->futex.phash.hash_new = NULL; if (fph) { if (cur && !cur->hash_mask) { @@ -1849,7 +1857,7 @@ again: * the second one returns here. */ free = fph; - mm->futex_phash_new = new; + mm->futex.phash.hash_new = new; return -EBUSY; } if (cur && !new) { @@ -1879,7 +1887,7 @@ again: if (new) { /* - * Will set mm->futex_phash_new on failure; + * Will set mm->futex.phash.hash_new on failure; * futex_private_hash_get() will try again. 
*/ if (!__futex_pivot_hash(mm, new) && custom) @@ -1898,11 +1906,9 @@ int futex_hash_allocate_default(void) return 0; scoped_guard(rcu) { - threads = min_t(unsigned int, - get_nr_threads(current), - num_online_cpus()); + threads = min_t(unsigned int, get_nr_threads(current), num_online_cpus()); - fph = rcu_dereference(current->mm->futex_phash); + fph = rcu_dereference(current->mm->futex.phash.hash); if (fph) { if (fph->custom) return 0; @@ -1929,24 +1935,52 @@ static int futex_hash_get_slots(void) struct futex_private_hash *fph; guard(rcu)(); - fph = rcu_dereference(current->mm->futex_phash); + fph = rcu_dereference(current->mm->futex.phash.hash); if (fph && fph->hash_mask) return fph->hash_mask + 1; return 0; } +#else /* CONFIG_FUTEX_PRIVATE_HASH */ +static inline int futex_hash_allocate(unsigned int hslots, unsigned int flags) { return -EINVAL; } +static inline int futex_hash_get_slots(void) { return 0; } +static inline void futex_hash_init_mm(struct futex_mm_data *fd) { } +#endif /* !CONFIG_FUTEX_PRIVATE_HASH */ -#else +#ifdef CONFIG_FUTEX_ROBUST_UNLOCK +static void futex_invalidate_cs_ranges(struct futex_mm_data *fd) +{ + /* + * Invalidate start_ip so that the quick check fails for ip >= start_ip + * if VDSO is not mapped or the second slot is not available for compat + * tasks as they use VDSO32 which does not provide the 64-bit pointer + * variant. 
+ */ + for (int i = 0; i < FUTEX_ROBUST_MAX_CS_RANGES; i++) + fd->unlock.cs_ranges[i].start_ip = ~0UL; +} -static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) +void futex_reset_cs_ranges(struct futex_mm_data *fd) { - return -EINVAL; + memset(fd->unlock.cs_ranges, 0, sizeof(fd->unlock.cs_ranges)); + futex_invalidate_cs_ranges(fd); } -static int futex_hash_get_slots(void) +static void futex_robust_unlock_init_mm(struct futex_mm_data *fd) { - return 0; + /* mm_dup() preserves the range, mm_alloc() clears it */ + if (!fd->unlock.cs_ranges[0].start_ip) + futex_invalidate_cs_ranges(fd); } +#else /* CONFIG_FUTEX_ROBUST_UNLOCK */ +static inline void futex_robust_unlock_init_mm(struct futex_mm_data *fd) { } +#endif /* !CONFIG_FUTEX_ROBUST_UNLOCK */ +#if defined(CONFIG_FUTEX_PRIVATE_HASH) || defined(CONFIG_FUTEX_ROBUST_UNLOCK) +void futex_mm_init(struct mm_struct *mm) +{ + futex_hash_init_mm(&mm->futex); + futex_robust_unlock_init_mm(&mm->futex); +} #endif int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) @@ -2001,7 +2035,7 @@ static int __init futex_init(void) BUG_ON(!table); for (i = 0; i < hashsize; i++) - futex_hash_bucket_init(&table[i], NULL); + futex_hash_bucket_init(&table[i]); futex_queues[n] = table; } diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 9f6bf6f585fc..f00f0863ed44 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -40,6 +40,8 @@ #define FLAGS_NUMA 0x0080 #define FLAGS_STRICT 0x0100 #define FLAGS_MPOL 0x0200 +#define FLAGS_ROBUST_UNLOCK 0x0400 +#define FLAGS_ROBUST_LIST32 0x0800 /* FUTEX_ to FLAGS_ */ static inline unsigned int futex_to_flags(unsigned int op) @@ -52,6 +54,12 @@ static inline unsigned int futex_to_flags(unsigned int op) if (op & FUTEX_CLOCK_REALTIME) flags |= FLAGS_CLOCKRT; + if (op & FUTEX_ROBUST_UNLOCK) + flags |= FLAGS_ROBUST_UNLOCK; + + if (op & FUTEX_ROBUST_LIST32) + flags |= FLAGS_ROBUST_LIST32; + return flags; } @@ -126,6 +134,15 @@ static inline 
bool should_fail_futex(bool fshared) } #endif +static inline bool futex_key_is_private(union futex_key *key) +{ + /* + * Relies on get_futex_key() to set either bit for shared + * futexes -- see comment with union futex_key. + */ + return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); +} + /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task @@ -135,7 +152,6 @@ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; - struct futex_private_hash *priv; } ____cacheline_aligned_in_smp; /* @@ -175,7 +191,7 @@ typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q); * @requeue_pi_key: the requeue_pi target futex key * @bitset: bitset for the optional bitmasked wakeup * @requeue_state: State field for futex_requeue_pi() - * @drop_hb_ref: Waiter should drop the extra hash bucket reference if true + * @drop_fph: Waiter should drop the extra private hash reference when set * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) * * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so @@ -202,7 +218,7 @@ struct futex_q { union futex_key *requeue_pi_key; u32 bitset; atomic_t requeue_state; - bool drop_hb_ref; + struct futex_private_hash *drop_fph; #ifdef CONFIG_PREEMPT_RT struct rcuwait requeue_wait; #endif @@ -222,28 +238,29 @@ extern struct hrtimer_sleeper * futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, int flags, u64 range_ns); -extern struct futex_hash_bucket *futex_hash(union futex_key *key); -#ifdef CONFIG_FUTEX_PRIVATE_HASH -extern void futex_hash_get(struct futex_hash_bucket *hb); -extern void futex_hash_put(struct futex_hash_bucket *hb); +struct futex_bucket_ref { + struct futex_hash_bucket *hb; + struct futex_private_hash *fph; +}; -extern struct futex_private_hash *futex_private_hash(void); +#ifdef CONFIG_FUTEX_PRIVATE_HASH +extern struct futex_private_hash 
*futex_private_hash(struct mm_struct *mm); extern void futex_private_hash_put(struct futex_private_hash *fph); #else /* !CONFIG_FUTEX_PRIVATE_HASH */ -static inline void futex_hash_get(struct futex_hash_bucket *hb) { } -static inline void futex_hash_put(struct futex_hash_bucket *hb) { } -static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } +static inline struct futex_private_hash *futex_private_hash(struct mm_struct *mm) { return NULL; } static inline void futex_private_hash_put(struct futex_private_hash *fph) { } #endif -DEFINE_CLASS(hb, struct futex_hash_bucket *, - if (_T) futex_hash_put(_T), +extern struct futex_bucket_ref futex_hash(union futex_key *key); + +DEFINE_CLASS(hbr, struct futex_bucket_ref, + if (_T.fph) futex_private_hash_put(_T.fph), futex_hash(key), union futex_key *key); DEFINE_CLASS(private_hash, struct futex_private_hash *, if (_T) futex_private_hash_put(_T), - futex_private_hash(), void); + futex_private_hash(mm), struct mm_struct *mm); /** * futex_match - Check whether two futex keys are equal @@ -449,13 +466,16 @@ extern int futex_unqueue_multiple(struct futex_vector *v, int count); extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to); -extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); +extern int futex_wake(u32 __user *uaddr, unsigned int flags, void __user *pop, + int nr_wake, u32 bitset); extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); -extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); +extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, void __user *pop); extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); +bool futex_robust_list_clear_pending(void __user *pop, unsigned int flags); + #endif /* _FUTEX_H */ diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index 
643199fdbe62..795011ea1202 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -14,7 +14,7 @@ int refill_pi_state_cache(void) { struct futex_pi_state *pi_state; - if (likely(current->pi_state_cache)) + if (likely(current->futex.pi_state_cache)) return 0; pi_state = kzalloc_obj(*pi_state); @@ -28,17 +28,17 @@ int refill_pi_state_cache(void) refcount_set(&pi_state->refcount, 1); pi_state->key = FUTEX_KEY_INIT; - current->pi_state_cache = pi_state; + current->futex.pi_state_cache = pi_state; return 0; } static struct futex_pi_state *alloc_pi_state(void) { - struct futex_pi_state *pi_state = current->pi_state_cache; + struct futex_pi_state *pi_state = current->futex.pi_state_cache; WARN_ON(!pi_state); - current->pi_state_cache = NULL; + current->futex.pi_state_cache = NULL; return pi_state; } @@ -60,7 +60,7 @@ static void pi_state_update_owner(struct futex_pi_state *pi_state, if (new_owner) { raw_spin_lock(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &new_owner->pi_state_list); + list_add(&pi_state->list, &new_owner->futex.pi_state_list); pi_state->owner = new_owner; raw_spin_unlock(&new_owner->pi_lock); } @@ -96,7 +96,7 @@ void put_pi_state(struct futex_pi_state *pi_state) raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } - if (current->pi_state_cache) { + if (current->futex.pi_state_cache) { kfree(pi_state); } else { /* @@ -106,7 +106,7 @@ void put_pi_state(struct futex_pi_state *pi_state) */ pi_state->owner = NULL; refcount_set(&pi_state->refcount, 1); - current->pi_state_cache = pi_state; + current->futex.pi_state_cache = pi_state; } } @@ -179,7 +179,7 @@ void put_pi_state(struct futex_pi_state *pi_state) * * p->pi_lock: * - * p->pi_state_list -> pi_state->list, relation + * p->futex.pi_state_list -> pi_state->list, relation * pi_mutex->owner -> pi_state->owner, relation * * pi_state->refcount: @@ -327,7 +327,7 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, * If the futex exit state is 
not yet FUTEX_STATE_DEAD, tell the * caller that the alleged owner is busy. */ - if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) + if (tsk && tsk->futex.state != FUTEX_STATE_DEAD) return -EBUSY; /* @@ -346,8 +346,8 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, * *uaddr = 0xC0000000; tsk = get_task(PID); * } if (!tsk->flags & PF_EXITING) { * ... attach(); - * tsk->futex_state = } else { - * FUTEX_STATE_DEAD; if (tsk->futex_state != + * tsk->futex.state = } else { + * FUTEX_STATE_DEAD; if (tsk->futex.state != * FUTEX_STATE_DEAD) * return -EAGAIN; * return -ESRCH; <--- FAIL @@ -396,7 +396,7 @@ static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, pi_state->key = *key; WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &p->pi_state_list); + list_add(&pi_state->list, &p->futex.pi_state_list); /* * Assignment without holding pi_state->pi_mutex.wait_lock is safe * because there is no concurrency as the object is not published yet. @@ -440,7 +440,7 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, * in futex_exit_release(), we do this protected by p->pi_lock: */ raw_spin_lock_irq(&p->pi_lock); - if (unlikely(p->futex_state != FUTEX_STATE_OK)) { + if (unlikely(p->futex.state != FUTEX_STATE_OK)) { /* * The task is on the way out. When the futex state is * FUTEX_STATE_DEAD, we know that the task has finished @@ -945,7 +945,8 @@ retry: retry_private: if (1) { - CLASS(hb, hb)(&q.key); + CLASS(hbr, hbr)(&q.key); + auto hb = hbr.hb; futex_q_lock(&q, hb); @@ -1009,7 +1010,7 @@ retry_private: * the thread, performing resize, will block on hb->lock during * the requeue. */ - futex_hash_put(no_free_ptr(hb)); + futex_private_hash_put(no_free_ptr(hbr.fph)); /* * Must be done before we enqueue the waiter, here is unfortunately * under the hb lock, but that *should* work because it does nothing. 
@@ -1100,11 +1101,9 @@ no_block: __release(&hb->lock); futex_unqueue_pi(&q); spin_unlock(q.lock_ptr); - if (q.drop_hb_ref) { - CLASS(hb, hb)(&q.key); - /* Additional reference from futex_unlock_pi() */ - futex_hash_put(hb); - } + + /* Additional reference from futex_unlock_pi() */ + futex_private_hash_put(q.drop_fph); goto out; out_unlock_put_key: @@ -1139,7 +1138,7 @@ out: * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) +static int __futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { u32 curval, uval, vpid = task_pid_vnr(current); union futex_key key = FUTEX_KEY_INIT; @@ -1148,7 +1147,6 @@ int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) if (!IS_ENABLED(CONFIG_FUTEX_PI)) return -ENOSYS; - retry: if (get_user(uval, uaddr)) return -EFAULT; @@ -1162,7 +1160,8 @@ retry: if (ret) return ret; - CLASS(hb, hb)(&key); + CLASS(hbr, hbr)(&key); + auto hb = hbr.hb; spin_lock(&hb->lock); retry_hb: @@ -1219,8 +1218,9 @@ retry_hb: * Acquire a reference for the leaving waiter to ensure * valid futex_q::lock_ptr. 
*/ - futex_hash_get(hb); - top_waiter->drop_hb_ref = true; + if (futex_key_is_private(&key)) + top_waiter->drop_fph = futex_private_hash(key.private.mm); + __futex_unqueue(top_waiter); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); goto retry_hb; @@ -1302,3 +1302,15 @@ pi_faulted: return ret; } +int futex_unlock_pi(u32 __user *uaddr, unsigned int flags, void __user *pop) +{ + int ret = __futex_unlock_pi(uaddr, flags); + + if (ret || !(flags & FLAGS_ROBUST_UNLOCK)) + return ret; + + if (!futex_robust_list_clear_pending(pop, flags)) + return -EFAULT; + + return 0; +} diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index b597cb3d17fc..7384672916fb 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -241,8 +241,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * Acquire a reference for the waiter to ensure valid * futex_q::lock_ptr. */ - futex_hash_get(hb); - q->drop_hb_ref = true; + if (futex_key_is_private(key)) + q->drop_fph = futex_private_hash(key->private.mm); q->lock_ptr = &hb->lock; task = READ_ONCE(q->task); @@ -459,8 +459,10 @@ retry: retry_private: if (1) { - CLASS(hb, hb1)(&key1); - CLASS(hb, hb2)(&key2); + CLASS(hbr, hbr1)(&key1); + CLASS(hbr, hbr2)(&key2); + auto hb1 = hbr1.hb; + auto hb2 = hbr2.hb; futex_hb_waiters_inc(hb2); double_lock_hb(hb1, hb2); @@ -643,6 +645,12 @@ retry_private: continue; } + /* Self-deadlock: non-top waiter already owns the PI futex. 
*/ + if (rt_mutex_owner(&pi_state->pi_mutex) == this->task) { + ret = -EDEADLK; + break; + } + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, this->task); @@ -832,7 +840,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, switch (futex_requeue_pi_wakeup_sync(&q)) { case Q_REQUEUE_PI_IGNORE: { - CLASS(hb, hb)(&q.key); + CLASS(hbr, hbr)(&q.key); + auto hb = hbr.hb; /* The waiter is still on uaddr1 */ spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, to); @@ -902,11 +911,8 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, default: BUG(); } - if (q.drop_hb_ref) { - CLASS(hb, hb)(&q.key); - /* Additional reference from requeue_pi_wake_futex() */ - futex_hash_put(hb); - } + /* Additional reference from requeue_pi_wake_futex() */ + futex_private_hash_put(q.drop_fph); out: if (to) { diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 77ad9691f6a6..2fa19d9d008d 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -25,17 +25,13 @@ * @head: pointer to the list-head * @len: length of the list-head, as userspace expects */ -SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, - size_t, len) +SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) { - /* - * The kernel knows only one size for now: - */ + /* The kernel knows only one size for now. 
*/ if (unlikely(len != sizeof(*head))) return -EINVAL; - current->robust_list = head; - + current->futex.robust_list = head; return 0; } @@ -43,9 +39,9 @@ static inline void __user *futex_task_robust_list(struct task_struct *p, bool co { #ifdef CONFIG_COMPAT if (compat) - return p->compat_robust_list; + return p->futex.compat_robust_list; #endif - return p->robust_list; + return p->futex.robust_list; } static void __user *futex_get_robust_list_common(int pid, bool compat) @@ -122,6 +118,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return -ENOSYS; } + if (flags & FLAGS_ROBUST_UNLOCK) { + if (cmd != FUTEX_WAKE && + cmd != FUTEX_WAKE_BITSET && + cmd != FUTEX_UNLOCK_PI) + return -ENOSYS; + } + switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; @@ -132,7 +135,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAKE_BITSET: - return futex_wake(uaddr, flags, val, val3); + return futex_wake(uaddr, flags, uaddr2, val, val3); case FUTEX_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: @@ -145,7 +148,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_LOCK_PI2: return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: - return futex_unlock_pi(uaddr, flags); + return futex_unlock_pi(uaddr, flags, uaddr2); case FUTEX_TRYLOCK_PI: return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: @@ -379,7 +382,7 @@ SYSCALL_DEFINE4(futex_wake, if (!futex_validate_input(flags, mask)) return -EINVAL; - return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); + return futex_wake(uaddr, FLAGS_STRICT | flags, NULL, nr, mask); } /* @@ -475,15 +478,13 @@ SYSCALL_DEFINE4(futex_requeue, } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(set_robust_list, - struct compat_robust_list_head __user *, head, - compat_size_t, len) +COMPAT_SYSCALL_DEFINE2(set_robust_list, struct 
compat_robust_list_head __user *, head, + compat_size_t, len) { if (unlikely(len != sizeof(*head))) return -EINVAL; - current->compat_robust_list = head; - + current->futex.compat_robust_list = head; return 0; } @@ -523,4 +524,3 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ - diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index ceed9d879059..d4483d15d30a 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -150,12 +150,35 @@ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) } /* + * If requested, clear the robust list pending op and unlock the futex + */ +static bool futex_robust_unlock(u32 __user *uaddr, unsigned int flags, void __user *pop) +{ + if (!(flags & FLAGS_ROBUST_UNLOCK)) + return true; + + /* First unlock the futex, which requires release semantics. */ + scoped_user_write_access(uaddr, efault) + unsafe_atomic_store_release_user(0, uaddr, efault); + + /* + * Clear the pending list op now. If that fails, then the task is in + * deeper trouble as the robust list head is usually part of the TLS. + * The chance of survival is close to zero. + */ + return futex_robust_list_clear_pending(pop, flags); + +efault: + return false; +} + +/* * Wake up waiters matching bitset queued on this futex (uaddr). 
*/ -int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) +int futex_wake(u32 __user *uaddr, unsigned int flags, void __user *pop, int nr_wake, u32 bitset) { - struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; + struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); int ret; @@ -166,10 +189,14 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (unlikely(ret != 0)) return ret; + if (!futex_robust_unlock(uaddr, flags, pop)) + return -EFAULT; + if ((flags & FLAGS_STRICT) && !nr_wake) return 0; - CLASS(hb, hb)(&key); + CLASS(hbr, hbr)(&key); + auto hb = hbr.hb; /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) @@ -266,8 +293,10 @@ retry: retry_private: if (1) { - CLASS(hb, hb1)(&key1); - CLASS(hb, hb2)(&key2); + CLASS(hbr, hbr1)(&key1); + CLASS(hbr, hbr2)(&key2); + auto hb1 = hbr1.hb; + auto hb2 = hbr2.hb; double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); @@ -409,7 +438,7 @@ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) * Make sure to have a reference on the private_hash such that we * don't block on rehash after changing the task state below. 
*/ - guard(private_hash)(); + guard(private_hash)(current->mm); /* * Enqueuing multiple futexes is tricky, because we need to enqueue @@ -446,7 +475,8 @@ retry: u32 val = vs[i].w.val; if (1) { - CLASS(hb, hb)(&q->key); + CLASS(hbr, hbr)(&q->key); + auto hb = hbr.hb; futex_q_lock(q, hb); ret = futex_get_value_locked(&uval, uaddr); @@ -621,7 +651,8 @@ retry: retry_private: if (1) { - CLASS(hb, hb)(&q->key); + CLASS(hbr, hbr)(&q->key); + auto hb = hbr.hb; futex_q_lock(q, hb); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6c9b1dc4e7d4..de754db414d1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/irqdomain.h> +#include <linux/preempt.h> #include <linux/random.h> #include <trace/events/irq.h> @@ -47,9 +48,11 @@ int irq_set_chip(unsigned int irq, const struct irq_chip *chip) scoped_irqdesc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); ret = 0; } - /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */ - if (!ret) + if (!ret) { + /* For !CONFIG_SPARSE_IRQ make the irq show up in allocated_irqs. */ irq_mark_irq(irq); + irq_proc_update_chip(chip); + } return ret; } EXPORT_SYMBOL(irq_set_chip); @@ -893,7 +896,10 @@ void handle_percpu_irq(struct irq_desc *desc) * * action->percpu_dev_id is a pointer to percpu variables which * contain the real device id for the cpu on which this handler is - * called + * called. + * + * May be used for NMI interrupt lines, and so may be called in IRQ or NMI + * context. */ void handle_percpu_devid_irq(struct irq_desc *desc) { @@ -930,7 +936,8 @@ void handle_percpu_devid_irq(struct irq_desc *desc) enabled ? 
" and unmasked" : "", irq, cpu); } - add_interrupt_randomness(irq); + if (!in_nmi()) + add_interrupt_randomness(irq); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); @@ -1007,6 +1014,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, WARN_ON(irq_chip_pm_get(irq_desc_get_irq_data(desc))); irq_activate_and_startup(desc, IRQ_RESEND); } + irq_proc_update_valid(desc); } void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, @@ -1067,6 +1075,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) trigger = tmp; irqd_set(&desc->irq_data, trigger); + irq_proc_update_valid(desc); } } EXPORT_SYMBOL_GPL(irq_modify_status); diff --git a/kernel/irq/debugfs.h b/kernel/irq/debugfs.h new file mode 100644 index 000000000000..8a9360d5fefb --- /dev/null +++ b/kernel/irq/debugfs.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_IRQ_DEBUGFS_H +#define _KERNEL_IRQ_DEBUGFS_H + +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +#include <linux/debugfs.h> + +struct irq_bit_descr { + unsigned int mask; + char *name; +}; + +#define BIT_MASK_DESCR(m) { .mask = m, .name = #m } + +void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, + const struct irq_bit_descr *sd, int size); + +void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); +static inline void irq_remove_debugfs_entry(struct irq_desc *desc) +{ + debugfs_remove(desc->debugfs_file); + kfree(desc->dev_name); +} +void irq_debugfs_copy_devname(int irq, struct device *dev); +# ifdef CONFIG_IRQ_DOMAIN +void irq_domain_debugfs_init(struct dentry *root); +# else +static inline void irq_domain_debugfs_init(struct dentry *root) +{ +} +# endif +#else /* CONFIG_GENERIC_IRQ_DEBUGFS */ +static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) +{ +} +static inline void irq_remove_debugfs_entry(struct irq_desc *d) +{ +} +static inline void irq_debugfs_copy_devname(int irq, struct device *dev) +{ +} 
+#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ + +#endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9412e57056f5..0ce21dd45404 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -9,8 +9,12 @@ #include <linux/irqdesc.h> #include <linux/kernel_stat.h> #include <linux/pm_runtime.h> +#include <linux/rcuref.h> #include <linux/sched/clock.h> +#include "debugfs.h" +#include "proc.h" + #ifdef CONFIG_SPARSE_IRQ # define MAX_SPARSE_IRQS INT_MAX #else @@ -21,6 +25,7 @@ extern bool noirqdebug; extern int irq_poll_cpu; +extern unsigned int total_nr_irqs; extern struct irqaction chained_action; @@ -100,9 +105,23 @@ extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ -static inline void irq_mark_irq(unsigned int irq) { } +static __always_inline void irq_mark_irq(unsigned int irq) { } +void irq_desc_free_rcu(struct irq_desc *desc); + +static __always_inline bool irq_desc_get_ref(struct irq_desc *desc) +{ + return rcuref_get(&desc->refcnt); +} + +static __always_inline void irq_desc_put_ref(struct irq_desc *desc) +{ + if (rcuref_put(&desc->refcnt)) + irq_desc_free_rcu(desc); +} #else extern void irq_mark_irq(unsigned int irq); +static __always_inline bool irq_desc_get_ref(struct irq_desc *desc) { return true; } +static __always_inline void irq_desc_put_ref(struct irq_desc *desc) { } #endif irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); @@ -122,6 +141,7 @@ extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); +void irq_proc_update_valid(struct irq_desc *desc); #else static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void unregister_irq_proc(unsigned int irq, struct 
irq_desc *desc) { } @@ -129,8 +149,11 @@ static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } +static inline void irq_proc_update_valid(struct irq_desc *desc) { } #endif +struct irq_desc *irq_find_desc_at_or_after(unsigned int offset); + extern bool irq_can_set_affinity_usr(unsigned int irq); extern int irq_do_set_affinity(struct irq_data *data, @@ -171,7 +194,7 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) __DEFINE_CLASS_IS_CONDITIONAL(irqdesc_lock, true); __DEFINE_UNLOCK_GUARD(irqdesc_lock, struct irq_desc, - __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus), + if (_T->lock) __irq_put_desc_unlock(_T->lock, _T->flags, _T->bus), unsigned long flags; bool bus); static inline class_irqdesc_lock_t class_irqdesc_lock_constructor(unsigned int irq, bool bus, @@ -372,42 +395,3 @@ static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd) return NULL; #endif } - -#ifdef CONFIG_GENERIC_IRQ_DEBUGFS -#include <linux/debugfs.h> - -struct irq_bit_descr { - unsigned int mask; - char *name; -}; - -#define BIT_MASK_DESCR(m) { .mask = m, .name = #m } - -void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state, - const struct irq_bit_descr *sd, int size); - -void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc); -static inline void irq_remove_debugfs_entry(struct irq_desc *desc) -{ - debugfs_remove(desc->debugfs_file); - kfree(desc->dev_name); -} -void irq_debugfs_copy_devname(int irq, struct device *dev); -# ifdef CONFIG_IRQ_DOMAIN -void irq_domain_debugfs_init(struct dentry *root); -# else -static inline void irq_domain_debugfs_init(struct dentry *root) -{ -} -# endif -#else /* CONFIG_GENERIC_IRQ_DEBUGFS */ -static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d) -{ -} -static inline void irq_remove_debugfs_entry(struct irq_desc *d) -{ -} -static 
inline void irq_debugfs_copy_devname(int irq, struct device *dev) -{ -} -#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 7173b8b634f2..80ef4e27dcf4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -137,17 +137,18 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->tot_count = 0; desc->name = NULL; desc->owner = owner; + rcuref_init(&desc->refcnt, 1); desc_smp_init(desc, node, affinity); } -static unsigned int nr_irqs = NR_IRQS; +unsigned int total_nr_irqs __read_mostly = NR_IRQS; /** * irq_get_nr_irqs() - Number of interrupts supported by the system. */ unsigned int irq_get_nr_irqs(void) { - return nr_irqs; + return total_nr_irqs; } EXPORT_SYMBOL_GPL(irq_get_nr_irqs); @@ -157,13 +158,12 @@ EXPORT_SYMBOL_GPL(irq_get_nr_irqs); * * Return: @nr. */ -unsigned int irq_set_nr_irqs(unsigned int nr) +unsigned int __init irq_set_nr_irqs(unsigned int nr) { - nr_irqs = nr; - + total_nr_irqs = nr; + irq_proc_calc_prec(); return nr; } -EXPORT_SYMBOL_GPL(irq_set_nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, @@ -181,15 +181,12 @@ static int irq_find_free_area(unsigned int from, unsigned int cnt) return mas.index; } -static unsigned int irq_find_at_or_after(unsigned int offset) +struct irq_desc *irq_find_desc_at_or_after(unsigned int offset) { unsigned long index = offset; - struct irq_desc *desc; - - guard(rcu)(); - desc = mt_find(&sparse_irqs, &index, nr_irqs); - return desc ? irq_desc_get_irq(desc) : nr_irqs; + lockdep_assert_in_rcu_read_lock(); + return mt_find(&sparse_irqs, &index, total_nr_irqs); } static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) @@ -466,6 +463,17 @@ static void delayed_free_desc(struct rcu_head *rhp) kobject_put(&desc->kobj); } +void irq_desc_free_rcu(struct irq_desc *desc) +{ + /* + * We free the descriptor, masks and stat fields via RCU. 
That + * allows demultiplex interrupts to do rcu based management of + * the child interrupts. + * This also allows us to use rcu in kstat_irqs_usr(). + */ + call_rcu(&desc->rcu, delayed_free_desc); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -484,14 +492,7 @@ static void free_desc(unsigned int irq) */ irq_sysfs_del(desc); delete_irq_desc(irq); - - /* - * We free the descriptor, masks and stat fields via RCU. That - * allows demultiplex interrupts to do rcu based management of - * the child interrupts. - * This also allows us to use rcu in kstat_irqs_usr(). - */ - call_rcu(&desc->rcu, delayed_free_desc); + irq_desc_put_ref(desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, @@ -543,7 +544,8 @@ static bool irq_expand_nr_irqs(unsigned int nr) { if (nr > MAX_SPARSE_IRQS) return false; - nr_irqs = nr; + total_nr_irqs = nr; + irq_proc_calc_prec(); return true; } @@ -557,21 +559,22 @@ int __init early_irq_init(void) /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", - NR_IRQS, nr_irqs, initcnt); + NR_IRQS, total_nr_irqs, initcnt); - if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS)) - nr_irqs = MAX_SPARSE_IRQS; + if (WARN_ON(total_nr_irqs > MAX_SPARSE_IRQS)) + total_nr_irqs = MAX_SPARSE_IRQS; if (WARN_ON(initcnt > MAX_SPARSE_IRQS)) initcnt = MAX_SPARSE_IRQS; - if (initcnt > nr_irqs) - nr_irqs = initcnt; + if (initcnt > total_nr_irqs) + total_nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node, 0, NULL, NULL); irq_insert_desc(i, desc); } + irq_proc_calc_prec(); return arch_early_irq_init(); } @@ -592,7 +595,7 @@ int __init early_irq_init(void) init_irq_default_affinity(); - printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); + pr_info("NR_IRQS: %d\n", NR_IRQS); count = ARRAY_SIZE(irq_desc); @@ -602,6 +605,7 @@ int __init early_irq_init(void) goto __free_desc_res; } + 
irq_proc_calc_prec(); return arch_early_irq_init(); __free_desc_res: @@ -862,7 +866,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) { int i; - if (from >= nr_irqs || (from + cnt) > nr_irqs) + if (from >= total_nr_irqs || (from + cnt) > total_nr_irqs) return; guard(mutex)(&sparse_irq_lock); @@ -911,7 +915,7 @@ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int no if (irq >=0 && start != irq) return -EEXIST; - if (start + cnt > nr_irqs) { + if (start + cnt > total_nr_irqs) { if (!irq_expand_nr_irqs(start + cnt)) return -ENOMEM; } @@ -923,11 +927,15 @@ EXPORT_SYMBOL_GPL(__irq_alloc_descs); * irq_get_next_irq - get next allocated irq number * @offset: where to start the search * - * Returns next irq number after offset or nr_irqs if none is found. + * Returns next irq number after offset or total_nr_irqs if none is found. */ unsigned int irq_get_next_irq(unsigned int offset) { - return irq_find_at_or_after(offset); + struct irq_desc *desc; + + guard(rcu)(); + desc = irq_find_desc_at_or_after(offset); + return desc ? irq_desc_get_irq(desc) : total_nr_irqs; } struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cc93abf009e8..f15c9f1223bb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -20,6 +20,8 @@ #include <linux/smp.h> #include <linux/fs.h> +#include "proc.h" + static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); @@ -1532,6 +1534,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, irq_data->chip = (struct irq_chip *)(chip ? 
chip : &no_irq_chip); irq_data->chip_data = chip_data; + irq_proc_update_chip(chip); return 0; } EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); @@ -2081,7 +2084,7 @@ static void irq_domain_free_one_irq(struct irq_domain *domain, unsigned int virq #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_IRQ_DEBUGFS -#include "internals.h" +#include "debugfs.h" static struct dentry *domain_dir; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2e8072437826..7eb07e3bdb4c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1802,6 +1802,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) __enable_irq(desc); } + irq_proc_update_valid(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); mutex_unlock(&desc->request_mutex); @@ -1906,6 +1907,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) desc->affinity_hint = NULL; #endif + irq_proc_update_valid(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); /* * Drop bus_lock here so the changes which were done in the chip @@ -2026,24 +2028,32 @@ const void *free_irq(unsigned int irq, void *dev_id) } EXPORT_SYMBOL(free_irq); -/* This function must be called with desc->lock held */ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) { + struct irqaction *action = NULL; const char *devname = NULL; - desc->istate &= ~IRQS_NMI; + scoped_guard(raw_spinlock_irqsave, &desc->lock) { + irq_nmi_teardown(desc); - if (!WARN_ON(desc->action == NULL)) { - irq_pm_remove_action(desc, desc->action); - devname = desc->action->name; - unregister_handler_proc(irq, desc->action); + desc->istate &= ~IRQS_NMI; - kfree(desc->action); + if (!WARN_ON(desc->action == NULL)) { + action = desc->action; + irq_pm_remove_action(desc, action); + devname = action->name; + } desc->action = NULL; + + irq_settings_clr_disable_unlazy(desc); + irq_shutdown_and_deactivate(desc); } - irq_settings_clr_disable_unlazy(desc); - 
irq_shutdown_and_deactivate(desc); + irq_proc_update_valid(desc); + + if (action) + unregister_handler_proc(irq, action); + kfree(action); irq_release_resources(desc); @@ -2067,8 +2077,6 @@ const void *free_nmi(unsigned int irq, void *dev_id) if (WARN_ON(desc->depth == 0)) disable_nmi_nosync(irq); - guard(raw_spinlock_irqsave)(&desc->lock); - irq_nmi_teardown(desc); return __cleanup_nmi(irq, desc); } @@ -2318,13 +2326,14 @@ int request_nmi(unsigned int irq, irq_handler_t handler, /* Setup NMI state */ desc->istate |= IRQS_NMI; retval = irq_nmi_setup(desc); - if (retval) { - __cleanup_nmi(irq, desc); - return -EINVAL; - } - return 0; } + if (retval) { + __cleanup_nmi(irq, desc); + return -EINVAL; + } + return 0; + err_irq_setup: irq_chip_pm_put(&desc->irq_data); err_out: @@ -2428,8 +2437,10 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ *action_ptr = action->next; /* Demote from NMI if we killed the last action */ - if (!desc->action) + if (!desc->action) { desc->istate &= ~IRQS_NMI; + irq_proc_update_valid(desc); + } } unregister_handler_proc(irq, action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index b0999a4f1f68..1b835725f7b1 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -10,6 +10,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/interrupt.h> +#include <linux/kernel.h> #include <linux/kernel_stat.h> #include <linux/mutex.h> #include <linux/string.h> @@ -326,7 +327,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) #undef MAX_NAMELEN -#define MAX_NAMELEN 10 +#define MAX_NAMELEN 11 void register_irq_proc(unsigned int irq, struct irq_desc *desc) { @@ -348,7 +349,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) return; /* create /proc/irq/1234 */ - sprintf(name, "%u", irq); + snprintf(name, MAX_NAMELEN, "%u", irq); desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) return; @@ -401,7 +402,7 @@ void unregister_irq_proc(unsigned int 
irq, struct irq_desc *desc) #endif remove_proc_entry("spurious", desc->dir); - sprintf(name, "%u", irq); + snprintf(name, MAX_NAMELEN, "%u", irq); remove_proc_entry(name, root_irq_dir); } @@ -439,77 +440,159 @@ void init_irq_proc(void) register_irq_proc(irq, desc); } +void irq_proc_update_valid(struct irq_desc *desc) +{ + u32 set = _IRQ_PROC_VALID; + + if (irq_settings_is_hidden(desc) || irq_desc_is_chained(desc) || !desc->action) + set = 0; + + irq_settings_update_proc_valid(desc, set); +} + #ifdef CONFIG_GENERIC_IRQ_SHOW +#define ARCH_PROC_IRQDESC ((void *)0x00001111) + int __weak arch_show_interrupts(struct seq_file *p, int prec) { return 0; } +static DEFINE_RAW_SPINLOCK(irq_proc_constraints_lock); + +static struct irq_proc_constraints { + bool print_header; + unsigned int num_prec; + unsigned int chip_width; +} irq_proc_constraints __read_mostly = { + .num_prec = 4, + .chip_width = 8, +}; + #ifndef ACTUAL_NR_IRQS -# define ACTUAL_NR_IRQS irq_get_nr_irqs() +# define ACTUAL_NR_IRQS total_nr_irqs #endif -int show_interrupts(struct seq_file *p, void *v) +void irq_proc_calc_prec(void) { - const unsigned int nr_irqs = irq_get_nr_irqs(); - static int prec; + unsigned int prec, n; - int i = *(loff_t *) v, j; - struct irqaction *action; - struct irq_desc *desc; + for (prec = 4, n = 10000; prec < 10 && n <= total_nr_irqs; ++prec) + n *= 10; + + guard(raw_spinlock_irqsave)(&irq_proc_constraints_lock); + if (prec > irq_proc_constraints.num_prec) + WRITE_ONCE(irq_proc_constraints.num_prec, prec); +} + +void irq_proc_update_chip(const struct irq_chip *chip) +{ + unsigned int len = chip && chip->name ? 
strlen(chip->name) : 0; + + if (!len || len <= READ_ONCE(irq_proc_constraints.chip_width)) + return; + + /* Can be invoked from interrupt disabled contexts */ + guard(raw_spinlock_irqsave)(&irq_proc_constraints_lock); + if (len > irq_proc_constraints.chip_width) + WRITE_ONCE(irq_proc_constraints.chip_width, len); +} + +/* Same as seq_put_decimal_ull_width(p, " ", cnt, 10) */ +#define ZSTR1 " 0" +#define ZSTR1_LEN (sizeof(ZSTR1) - 1) +#define ZSTR16 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 \ + ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 ZSTR1 +#define ZSTR256 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 \ + ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 ZSTR16 + +static inline void irq_proc_emit_zero_counts(struct seq_file *p, unsigned int zeros) +{ + if (!zeros) + return; + + for (unsigned int n = min(zeros, 256); n; zeros -= n, n = min(zeros, 256)) + seq_write(p, ZSTR256, n * ZSTR1_LEN); +} + +static inline unsigned int irq_proc_emit_count(struct seq_file *p, unsigned int cnt, + unsigned int zeros) +{ + if (!cnt) + return zeros + 1; - if (i > ACTUAL_NR_IRQS) - return 0; + irq_proc_emit_zero_counts(p, zeros); + seq_put_decimal_ull_width(p, " ", cnt, 10); + return 0; +} - if (i == ACTUAL_NR_IRQS) - return arch_show_interrupts(p, prec); +void irq_proc_emit_counts(struct seq_file *p, unsigned int __percpu *cnts) +{ + unsigned int cpu, zeros = 0; - /* print header and calculate the width of the first column */ - if (i == 0) { - for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) - j *= 10; + for_each_online_cpu(cpu) + zeros = irq_proc_emit_count(p, per_cpu(*cnts, cpu), zeros); + irq_proc_emit_zero_counts(p, zeros); +} - seq_printf(p, "%*s", prec + 8, ""); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d", j); +static int irq_seq_show(struct seq_file *p, void *v) +{ + struct irq_proc_constraints *constr = p->private; + struct irq_desc *desc = v; + struct irqaction *action; + + /* Print header for the first interrupt? 
*/ + if (constr->print_header) { + unsigned int cpu; + + seq_printf(p, "%*s", constr->num_prec + 8, ""); + for_each_online_cpu(cpu) + seq_printf(p, "CPU%-8d", cpu); seq_putc(p, '\n'); + constr->print_header = false; } - guard(rcu)(); - desc = irq_to_desc(i); - if (!desc || irq_settings_is_hidden(desc)) - return 0; + if (desc == ARCH_PROC_IRQDESC) + return arch_show_interrupts(p, constr->num_prec); - if (!desc->action || irq_desc_is_chained(desc) || !desc->kstat_irqs) - return 0; + seq_put_decimal_ull_width(p, "", irq_desc_get_irq(desc), constr->num_prec); + seq_putc(p, ':'); - seq_printf(p, "%*d:", prec, i); - for_each_online_cpu(j) { - unsigned int cnt = desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0; + /* + * Always output per CPU interrupts. Output device interrupts only when + * desc::tot_count is not zero. + */ + if (irq_settings_is_per_cpu(desc) || irq_settings_is_per_cpu_devid(desc) || + data_race(desc->tot_count)) + irq_proc_emit_counts(p, &desc->kstat_irqs->cnt); + else + irq_proc_emit_zero_counts(p, num_online_cpus()); - seq_put_decimal_ull_width(p, " ", cnt, 10); - } - seq_putc(p, ' '); + /* Enforce a visual gap */ + seq_write(p, " ", 2); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); else if (desc->irq_data.chip->name) - seq_printf(p, "%8s", desc->irq_data.chip->name); + seq_printf(p, "%-*s", constr->chip_width, desc->irq_data.chip->name); else - seq_printf(p, "%8s", "-"); + seq_printf(p, "%-*s", constr->chip_width, "-"); } else { - seq_printf(p, "%8s", "None"); + seq_printf(p, "%-*s", constr->chip_width, "None"); } + + seq_putc(p, ' '); if (desc->irq_data.domain) - seq_printf(p, " %*lu", prec, desc->irq_data.hwirq); + seq_put_decimal_ull_width(p, "", desc->irq_data.hwirq, constr->num_prec); else - seq_printf(p, " %*s", prec, ""); -#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL - seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? 
"Level" : "Edge"); -#endif + seq_printf(p, " %*s", constr->num_prec, ""); + + if (IS_ENABLED(CONFIG_GENERIC_IRQ_SHOW_LEVEL)) + seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); + if (desc->name) seq_printf(p, "-%-8s", desc->name); @@ -523,4 +606,73 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); return 0; } + +static void *irq_seq_next_desc(loff_t *pos) +{ + if (*pos > total_nr_irqs) + return NULL; + + guard(rcu)(); + for (;;) { + struct irq_desc *desc = irq_find_desc_at_or_after((unsigned int) *pos); + + if (desc) { + *pos = irq_desc_get_irq(desc); + /* + * If valid for output then try to acquire a reference + * count on the descriptor so that it can't be freed + * after dropping RCU read lock on return. + */ + if (irq_settings_proc_valid(desc) && irq_desc_get_ref(desc)) + return desc; + (*pos)++; + } else { + *pos = total_nr_irqs; + return ARCH_PROC_IRQDESC; + } + } +} + +static void *irq_seq_start(struct seq_file *f, loff_t *pos) +{ + if (!*pos) { + struct irq_proc_constraints *constr = f->private; + + constr->num_prec = READ_ONCE(irq_proc_constraints.num_prec); + constr->chip_width = READ_ONCE(irq_proc_constraints.chip_width); + constr->print_header = true; + } + return irq_seq_next_desc(pos); +} + +static void *irq_seq_next(struct seq_file *f, void *v, loff_t *pos) +{ + if (v && v != ARCH_PROC_IRQDESC) + irq_desc_put_ref(v); + + (*pos)++; + return irq_seq_next_desc(pos); +} + +static void irq_seq_stop(struct seq_file *f, void *v) +{ + if (v && v != ARCH_PROC_IRQDESC) + irq_desc_put_ref(v); +} + +static const struct seq_operations irq_seq_ops = { + .start = irq_seq_start, + .next = irq_seq_next, + .stop = irq_seq_stop, + .show = irq_seq_show, +}; + +static int __init irq_proc_init(void) +{ + proc_create_seq_private("interrupts", 0, NULL, &irq_seq_ops, + sizeof(irq_proc_constraints), NULL); + return 0; +} +fs_initcall(irq_proc_init); + #endif diff --git a/kernel/irq/proc.h b/kernel/irq/proc.h new file mode 
100644 index 000000000000..0631d57fbfb7 --- /dev/null +++ b/kernel/irq/proc.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KERNEL_IRQ_PROC_H +#define _KERNEL_IRQ_PROC_H + +#if defined(CONFIG_PROC_FS) && defined(CONFIG_GENERIC_IRQ_SHOW) +void irq_proc_calc_prec(void); +void irq_proc_update_chip(const struct irq_chip *chip); +#else +static inline void irq_proc_calc_prec(void) { } +static inline void irq_proc_update_chip(const struct irq_chip *chip) { } +#endif + +#endif diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 00b3bd127692..0a0c027a5d34 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -18,6 +18,7 @@ enum { _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, _IRQ_HIDDEN = IRQ_HIDDEN, _IRQ_NO_DEBUG = IRQ_NO_DEBUG, + _IRQ_PROC_VALID = IRQ_RESERVED, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -34,6 +35,7 @@ enum { #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON #define IRQ_HIDDEN GOT_YOU_MORON #define IRQ_NO_DEBUG GOT_YOU_MORON +#define IRQ_RESERVED GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -180,3 +182,14 @@ static inline bool irq_settings_no_debug(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_NO_DEBUG; } + +static inline bool irq_settings_proc_valid(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_PROC_VALID; +} + +static inline void irq_settings_update_proc_valid(struct irq_desc *desc, u32 set) +{ + desc->status_use_accessors &= ~_IRQ_PROC_VALID; + desc->status_use_accessors |= (set & _IRQ_PROC_VALID); +} diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 120fd7365fbe..f7e2dc2c30c6 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -292,6 +292,12 @@ void irq_work_sync(struct irq_work *work) !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); + /* + * Ensure irq_work_single() does not access @work + * after removing IRQ_WORK_BUSY. 
It is always + * accessed within a RCU-read section. + */ + synchronize_rcu(); return; } @@ -302,6 +308,7 @@ EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { + guard(rcu)(); irq_work_run_list(this_cpu_ptr(&lazy_list)); } diff --git a/kernel/kthread.c b/kernel/kthread.c index 791210daf8b4..63beb59b7a3d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1619,7 +1619,6 @@ void kthread_use_mm(struct mm_struct *mm) WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); - WARN_ON_ONCE(!mm->user_ns); /* * It is possible for mm to be the same as tsk->active_mm, but diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 18509d8082ea..1b592d86dc48 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -357,20 +357,6 @@ int kho_radix_walk_tree(struct kho_radix_tree *tree, } EXPORT_SYMBOL_GPL(kho_radix_walk_tree); -static void __kho_unpreserve(struct kho_radix_tree *tree, - unsigned long pfn, unsigned long end_pfn) -{ - unsigned int order; - - while (pfn < end_pfn) { - order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - - kho_radix_del_page(tree, pfn, order); - - pfn += 1 << order; - } -} - /* For physically contiguous 0-order pages. */ static void kho_init_pages(struct page *page, unsigned long nr_pages) { @@ -860,6 +846,37 @@ void kho_unpreserve_folio(struct folio *folio) } EXPORT_SYMBOL_GPL(kho_unpreserve_folio); +static unsigned int __kho_preserve_pages_order(unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned int order = min(count_trailing_zeros(start_pfn), + ilog2(end_pfn - start_pfn)); + + /* + * Make sure all the pages in a single preservation are in the same NUMA + * node. The restore machinery can not cope with a preservation spanning + * multiple NUMA nodes. 
+ */ + while (pfn_to_nid(start_pfn) != pfn_to_nid(start_pfn + (1UL << order) - 1)) + order--; + + return order; +} + +static void __kho_unpreserve(struct kho_radix_tree *tree, + unsigned long pfn, unsigned long end_pfn) +{ + unsigned int order; + + while (pfn < end_pfn) { + order = __kho_preserve_pages_order(pfn, end_pfn); + + kho_radix_del_page(tree, pfn, order); + + pfn += 1 << order; + } +} + /** * kho_preserve_pages - preserve contiguous pages across kexec * @page: first page in the list. @@ -885,16 +902,7 @@ int kho_preserve_pages(struct page *page, unsigned long nr_pages) } while (pfn < end_pfn) { - unsigned int order = - min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); - - /* - * Make sure all the pages in a single preservation are in the - * same NUMA node. The restore machinery can not cope with a - * preservation spanning multiple NUMA nodes. - */ - while (pfn_to_nid(pfn) != pfn_to_nid(pfn + (1UL << order) - 1)) - order--; + unsigned int order = __kho_preserve_pages_order(pfn, end_pfn); err = kho_radix_add_page(tree, pfn, order); if (err) { @@ -1707,7 +1715,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_enable) + if (!kho_enable || image->type == KEXEC_TYPE_CRASH) return 0; image->kho.fdt = virt_to_phys(kho_out.fdt); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 09534628dc01..8a85912d7ee6 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -763,6 +763,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas raw_spin_lock_irqsave(&lock->wait_lock, flags); raw_spin_lock(¤t->blocked_lock); __set_task_blocked_on(current, lock); + set_current_state(state); if (opt_acquired) break; @@ -980,9 +981,8 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) __releases(lock) { - struct task_struct *next = NULL; + struct task_struct *donor, *next = NULL; struct 
mutex_waiter *waiter; - DEFINE_WAKE_Q(wake_q); unsigned long owner; unsigned long flags; @@ -990,6 +990,14 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne __release(lock); /* + * Ensures the proxy donor stack is stable across unlock and handoff. + * Specifically, it avoids the case where current->blocked_donor is + * NULL when it is inspected while doing the unlock, but a preemption + * before taking the wake_lock would make it set and a hand-off is + * missed. + */ + guard(preempt)(); + /* * Release the lock before (potentially) taking the spinlock such that * other contenders can get on with things ASAP. * @@ -1001,6 +1009,12 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne MUTEX_WARN_ON(__owner_task(owner) != current); MUTEX_WARN_ON(owner & MUTEX_FLAG_PICKUP); + if (sched_proxy_exec() && current->blocked_donor) { + /* force handoff if we have a blocked_donor */ + owner = MUTEX_FLAG_HANDOFF; + break; + } + if (owner & MUTEX_FLAG_HANDOFF) break; @@ -1013,20 +1027,56 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne } raw_spin_lock_irqsave(&lock->wait_lock, flags); + raw_spin_lock(¤t->blocked_lock); debug_mutex_unlock(lock); + + if (sched_proxy_exec()) { + /* + * If we have a task boosting current, and that task was boosting + * current through this lock, hand the lock to that task, as that + * is the highest waiter, as selected by the scheduling function. + */ + donor = current->blocked_donor; + if (donor) { + struct mutex *next_lock; + + raw_spin_lock_nested(&donor->blocked_lock, SINGLE_DEPTH_NESTING); + next_lock = __get_task_blocked_on(donor); + if (next_lock == lock) { + next = get_task_struct(donor); + __clear_task_blocked_on(next, lock); + current->blocked_donor = NULL; + } + raw_spin_unlock(&donor->blocked_lock); + } + } + + /* + * Failing that, pick first on the wait list. 
+ */ waiter = lock->first_waiter; - if (waiter) { - next = waiter->task; + if (!next && waiter) { + next = get_task_struct(waiter->task); + raw_spin_lock_nested(&next->blocked_lock, SINGLE_DEPTH_NESTING); debug_mutex_wake_waiter(lock, waiter); - set_task_blocked_on_waking(next, lock); - wake_q_add(&wake_q, next); + __clear_task_blocked_on(next, lock); + raw_spin_unlock(&next->blocked_lock); + } + if (trace_contended_release_enabled() && waiter) + trace_call__contended_release(lock); + if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); + raw_spin_unlock(¤t->blocked_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + if (next) { + wake_up_process(next); + put_task_struct(next); + } } #ifndef CONFIG_DEBUG_LOCK_ALLOC @@ -1220,6 +1270,7 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible); EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin); EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(contended_release); /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index ef234469baac..f7e152c40d6d 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -263,6 +263,9 @@ void percpu_up_write(struct percpu_rw_semaphore *sem) { rwsem_release(&sem->dep_map, _RET_IP_); + if (trace_contended_release_enabled() && wq_has_sleeper(&sem->waiters)) + trace_call__contended_release(sem); + /* * Signal the writer is done, no fast path yet. * @@ -288,3 +291,29 @@ void percpu_up_write(struct percpu_rw_semaphore *sem) rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); + +void __percpu_up_read(struct percpu_rw_semaphore *sem) +{ + lockdep_assert_preemption_disabled(); + /* + * After percpu_up_write() completes, rcu_sync_is_idle() can still + * return false during the grace period, forcing readers into this + * slowpath. 
Only trace when a writer is actually waiting for + * readers to drain. + */ + if (trace_contended_release_enabled() && rcuwait_active(&sem->writer)) + trace_call__contended_release(sem); + /* + * slowpath; reader will only ever wake a single blocked + * writer. + */ + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to + * aggregate zero, as that is the only time it matters) they + * will also see our critical section. + */ + this_cpu_dec(*sem->read_count); + rcuwait_wake_up(&sem->writer); +} +EXPORT_SYMBOL_GPL(__percpu_up_read); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4f386ea6c792..4728631ae719 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -484,6 +484,7 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod static __always_inline void rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { lockdep_assert_held(&lock->wait_lock); @@ -492,6 +493,7 @@ rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) static __always_inline void rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { lockdep_assert_held(&lock->wait_lock); @@ -1092,6 +1094,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, static int __sched try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { lockdep_assert_held(&lock->wait_lock); @@ -1319,6 +1322,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, */ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { struct rt_mutex_waiter *waiter; @@ -1466,6 +1470,7 @@ static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock) raw_spin_lock_irqsave(&lock->wait_lock, flags); } + 
trace_contended_release(lock); /* * The wakeup next waiter path does not suffer from the above * race. See the comments there. @@ -1558,6 +1563,9 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, lockdep_assert_held(&lock->wait_lock); + if (!waiter_task) /* never enqueued */ + return; + scoped_guard(raw_spinlock, &waiter_task->pi_lock) { rt_mutex_dequeue(lock, waiter); waiter_task->pi_blocked_on = NULL; diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 124219aea46e..5d48d64725b1 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -41,6 +41,7 @@ static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, unsigned int state, struct lockdep_map *nest_lock, unsigned int subclass) + __cond_acquires(0, lock) { int ret; @@ -67,13 +68,27 @@ EXPORT_SYMBOL(rt_mutex_base_init); */ void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass); + if (__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass) == 0) + return; + /* + * The code below is never reached because __rt_mutex_lock_common() only + * returns an error code if interrupted by a signal or upon a timeout. + */ + WARN_ON_ONCE(true); + __acquire(lock); } EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0); + if (__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0) == 0) + return; + /* + * The code below is never reached because __rt_mutex_lock_common() only + * returns an error code if interrupted by a signal or upon a timeout. 
+ */ + WARN_ON_ONCE(true); + __acquire(lock); } EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); @@ -86,7 +101,14 @@ EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0); + if (__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0) == 0) + return; + /* + * The code below is never reached because __rt_mutex_lock_common() only + * returns an error code if interrupted by a signal or upon a timeout. + */ + WARN_ON_ONCE(true); + __acquire(lock); } EXPORT_SYMBOL_GPL(rt_mutex_lock); #endif @@ -157,6 +179,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) { mutex_release(&lock->dep_map, _RET_IP_); __rt_mutex_unlock(&lock->rtmutex); + __release(lock); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); @@ -182,6 +205,7 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex_base *lock) */ bool __sched __rt_mutex_futex_unlock(struct rt_mutex_base *lock, struct rt_wake_q_head *wqh) + __must_hold(&lock->wait_lock) { lockdep_assert_held(&lock->wait_lock); @@ -312,6 +336,7 @@ int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { int ret; @@ -365,7 +390,7 @@ int __sched rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, raw_spin_lock_irq(&lock->wait_lock); ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q); - if (unlikely(ret)) + if (unlikely(ret < 0)) remove_waiter(lock, waiter); preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 82e078c0665a..2835c9ef9b3f 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -174,6 +174,8 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, unsigned int state) { + if (trace_contended_release_enabled() && 
rt_mutex_owner(&rwb->rtmutex)) + trace_call__contended_release(rwb); /* * rwb->readers can only hit 0 when a writer is waiting for the * active readers to leave the critical section. @@ -205,6 +207,8 @@ static inline void rwbase_write_unlock(struct rwbase_rt *rwb) unsigned long flags; raw_spin_lock_irqsave(&rtm->wait_lock, flags); + if (trace_contended_release_enabled() && rt_mutex_has_waiters(rtm)) + trace_call__contended_release(rwb); __rwbase_write_unlock(rwb, WRITER_BIAS, flags); } @@ -214,6 +218,8 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb) unsigned long flags; raw_spin_lock_irqsave(&rtm->wait_lock, flags); + if (trace_contended_release_enabled() && rt_mutex_has_waiters(rtm)) + trace_call__contended_release(rwb); /* Release it and account current as reader */ __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags); } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index bf647097369c..b9c180ac1eee 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1387,6 +1387,8 @@ static inline void __up_read(struct rw_semaphore *sem) rwsem_clear_reader_owned(sem); tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); + if (trace_contended_release_enabled() && (tmp & RWSEM_FLAG_WAITERS)) + trace_call__contended_release(sem); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) { clear_nonspinnable(sem); @@ -1413,8 +1415,10 @@ static inline void __up_write(struct rw_semaphore *sem) preempt_disable(); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); - if (unlikely(tmp & RWSEM_FLAG_WAITERS)) + if (unlikely(tmp & RWSEM_FLAG_WAITERS)) { + trace_contended_release(sem); rwsem_wake(sem); + } preempt_enable(); } @@ -1437,8 +1441,10 @@ static inline void __downgrade_write(struct rw_semaphore *sem) tmp = atomic_long_fetch_add_release( -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); 
rwsem_set_reader_owned(sem); - if (tmp & RWSEM_FLAG_WAITERS) + if (tmp & RWSEM_FLAG_WAITERS) { + trace_contended_release(sem); rwsem_downgrade_wake(sem); + } preempt_enable(); } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 74d41433ba13..233730c25933 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -230,6 +230,10 @@ void __sched up(struct semaphore *sem) sem->count++; else __up(sem, &wake_q); + + if (trace_contended_release_enabled() && !wake_q_empty(&wake_q)) + trace_call__contended_release(sem); + raw_spin_unlock_irqrestore(&sem->lock, flags); if (!wake_q_empty(&wake_q)) wake_up_q(&wake_q); diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 6c12452097e1..d62b49b53ec3 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -324,7 +324,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, * blocked_on to PROXY_WAKING. Otherwise we can see * circular blocked_on relationships that can't resolve. */ - set_task_blocked_on_waking(waiter->task, lock); + clear_task_blocked_on(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -383,7 +383,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * are waking the mutex owner, who may be currently * blocked on a different mutex. 
*/ - set_task_blocked_on_waking(owner, NULL); + clear_task_blocked_on(owner, NULL); wake_q_add(wake_q, owner); } return true; diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index 36f52a232a12..cce098671be9 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -307,6 +307,8 @@ int module_decompress(struct load_info *info, const void *buf, size_t size) */ n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2; error = module_extend_max_pages(info, n_pages); + if (error) + return error; data_size = MODULE_DECOMPRESS_FN(info, buf, size); if (data_size < 0) { diff --git a/kernel/panic.c b/kernel/panic.c index 20feada5319d..213725b612aa 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -39,6 +39,7 @@ #include <linux/sys_info.h> #include <trace/events/error_report.h> #include <asm/sections.h> +#include <kunit/test-bug.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -1124,6 +1125,11 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, bool rcu = warn_rcu_enter(); struct warn_args args; + if (kunit_is_suppressed_warning(true)) { + warn_rcu_exit(rcu); + return; + } + pr_warn(CUT_HERE); if (!fmt) { @@ -1146,6 +1152,11 @@ void __warn_printk(const char *fmt, ...) bool rcu = warn_rcu_enter(); va_list args; + if (kunit_is_suppressed_warning(false)) { + warn_rcu_exit(rcu); + return; + } + pr_warn(CUT_HERE); va_start(args, fmt); diff --git a/kernel/params.c b/kernel/params.c index 74d620bc2521..a668863a4bb6 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -942,9 +942,9 @@ const struct kobj_type module_ktype = { /* * param_sysfs_init - create "module" kset * - * This must be done before the initramfs is unpacked and - * request_module() thus becomes possible, because otherwise the - * module load would fail in mod_sysfs_init. 
+ * This must be done before any driver registration so that when a driver comes + * from a built-in module, the driver core can add the module under /sys/module + * and create the associated driver symlinks. */ static int __init param_sysfs_init(void) { @@ -957,7 +957,7 @@ static int __init param_sysfs_init(void) return 0; } -subsys_initcall(param_sysfs_init); +pure_initcall(param_sysfs_init); /* * param_sysfs_builtin_init - add sysfs version and parameter diff --git a/kernel/pid.c b/kernel/pid.c index fd5c2d4aa349..f55189a3d07d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -885,10 +885,12 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) if (ret) return ERR_PTR(ret); - if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) - file = fget_task(task, fd); - else + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) file = ERR_PTR(-EPERM); + else if (task->flags & PF_EXITING) + file = ERR_PTR(-ESRCH); + else + file = fget_task(task, fd); up_read(&task->signal->exec_update_lock); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 05337f437cca..530c897311d4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -42,6 +42,7 @@ config HIBERNATION select CRC32 select CRYPTO select CRYPTO_LZO + select CRYPTO_LZ4 help Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index af8d07bafe02..d2479c69d71a 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -392,23 +392,6 @@ static int create_image(int platform_mode) return error; } -static void shrink_shmem_memory(void) -{ - struct sysinfo info; - unsigned long nr_shmem_pages, nr_freed_pages; - - si_meminfo(&info); - nr_shmem_pages = info.sharedram; /* current page count used for shmem */ - /* - * The intent is to reclaim all shmem pages. 
Though shrink_all_memory() can - * only reclaim about half of them, it's enough for creating the hibernation - * image. - */ - nr_freed_pages = shrink_all_memory(nr_shmem_pages); - pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n", - nr_shmem_pages, nr_freed_pages); -} - /** * hibernation_snapshot - Quiesce devices and create a hibernation image. * @platform_mode: If set, use platform driver to prepare for the transition. @@ -425,14 +408,9 @@ int hibernation_snapshot(int platform_mode) if (error) goto Close; - /* Preallocate image memory before shutting down devices. */ - error = hibernate_preallocate_memory(); - if (error) - goto Close; - error = freeze_kernel_threads(); if (error) - goto Cleanup; + goto Close; if (hibernation_test(TEST_FREEZER)) { @@ -445,19 +423,13 @@ int hibernation_snapshot(int platform_mode) } error = dpm_prepare(PMSG_FREEZE); - if (error) { - dpm_complete(PMSG_RECOVER); - goto Thaw; - } + if (error) + goto Complete; - /* - * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem - * pages will use lots of system memory, causing hibernation image creation - * fail due to insufficient free memory. - * This call is to force flush the shmem pages to swap disk and reclaim - * the system memory so that image creation can succeed. - */ - shrink_shmem_memory(); + /* Preallocate image memory before shutting down devices. 
*/ + error = hibernate_preallocate_memory(); + if (error) + goto Complete; console_suspend_all(); pm_restrict_gfp_mask(); @@ -492,10 +464,10 @@ int hibernation_snapshot(int platform_mode) platform_end(platform_mode); return error; + Complete: + dpm_complete(PMSG_RECOVER); Thaw: thaw_kernel_threads(); - Cleanup: - swsusp_free(); goto Close; } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 398b994b73aa..1944dbeb0d4c 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -519,18 +519,23 @@ static int __init cpu_latency_qos_init(void) int ret; ret = misc_register(&cpu_latency_qos_miscdev); - if (ret < 0) + if (ret < 0) { pr_err("%s: %s setup failed\n", __func__, cpu_latency_qos_miscdev.name); + return ret; + } #ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP ret = misc_register(&cpu_wakeup_latency_qos_miscdev); - if (ret < 0) + if (ret < 0) { pr_err("%s: %s setup failed\n", __func__, cpu_wakeup_latency_qos_miscdev.name); + misc_deregister(&cpu_latency_qos_miscdev); + return ret; + } #endif - return ret; + return 0; } late_initcall(cpu_latency_qos_init); #endif /* CONFIG_CPU_IDLE */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 2e64869bb5a0..b28233b8d00e 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -570,29 +570,23 @@ struct crc_data { wait_queue_head_t done; /* crc update done */ u32 *crc32; /* points to handle's crc32 */ size_t **unc_len; /* uncompressed lengths */ - unsigned char **unc; /* uncompressed data */ + unsigned char *unc[]; /* uncompressed data */ }; static struct crc_data *alloc_crc_data(int nr_threads) { struct crc_data *crc; - crc = kzalloc_obj(*crc); + crc = kzalloc_flex(*crc, unc, nr_threads); if (!crc) return NULL; - crc->unc = kcalloc(nr_threads, sizeof(*crc->unc), GFP_KERNEL); - if (!crc->unc) - goto err_free_crc; - crc->unc_len = kzalloc_objs(*crc->unc_len, nr_threads); if (!crc->unc_len) - goto err_free_unc; + goto err_free_crc; return crc; -err_free_unc: - kfree(crc->unc); err_free_crc: kfree(crc); return 
NULL; @@ -607,7 +601,6 @@ static void free_crc_data(struct crc_data *crc) kthread_stop(crc->thr); kfree(crc->unc_len); - kfree(crc->unc); kfree(crc); } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 68c17daef8d4..d041645d9d17 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> +#include <linux/sched/exec_state.h> #include <linux/sched/task.h> #include <linux/errno.h> #include <linux/mm.h> @@ -36,6 +37,30 @@ #include <asm/syscall.h> /* for syscall_get_* */ +/** + * ptracer_access_allowed - may current peek/poke @tsk's address space? + * @tsk: tracee + * + * Per-access check used by ptrace_access_vm() and architecture-specific + * tag/register accessors. Returns true iff current is the registered + * ptracer of @tsk and either @tsk is owner-dumpable or current holds + * CAP_SYS_PTRACE in @tsk's exec namespace. Lighter than + * __ptrace_may_access(): it re-validates only dumpability and + * capability on every access, without re-running LSM hooks or + * cred_cap_issubset() checks performed at attach time. + */ +bool ptracer_access_allowed(struct task_struct *tsk) +{ + const struct task_exec_state *es; + + guard(rcu)(); + if (ptrace_parent(tsk) != current) + return false; + es = task_exec_state_rcu(tsk); + return READ_ONCE(es->dumpable) == TASK_DUMPABLE_OWNER || + ptracer_capable(tsk, es->user_ns); +} + /* * Access another process' address space via ptrace. 
* Source/target buffer must be kernel space, @@ -45,21 +70,14 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; - int ret; + int ret = 0; mm = get_task_mm(tsk); if (!mm) return 0; - if (!tsk->ptrace || - (current != tsk->parent) || - ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptracer_capable(tsk, mm->user_ns))) { - mmput(mm); - return 0; - } - - ret = access_remote_vm(mm, addr, buf, len, gup_flags); + if (ptracer_access_allowed(tsk)) + ret = access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; @@ -272,11 +290,21 @@ static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) return ns_capable(ns, CAP_SYS_PTRACE); } +static bool task_still_dumpable(struct task_struct *task, unsigned int mode) +{ + const struct task_exec_state *exec_state; + + guard(rcu)(); + exec_state = task_exec_state_rcu(task); + if (READ_ONCE(exec_state->dumpable) == TASK_DUMPABLE_OWNER) + return true; + return ptrace_has_cap(exec_state->user_ns, mode); +} + /* Returns 0 on success, -errno on denial. */ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; @@ -337,11 +365,8 @@ ok: * Pairs with a write barrier in commit_creds(). 
*/ smp_rmb(); - mm = task->mm; - if (mm && - ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(mm->user_ns, mode))) - return -EPERM; + if (!task_still_dumpable(task, mode)) + return -EPERM; return security_ptrace_access_check(task, mode); } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 5f2848b828dc..882a158ada7b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -572,7 +572,7 @@ static unsigned long rcu_no_completed(void) static void rcu_torture_deferred_free(struct rcu_torture *p) { - call_rcu_hurry(&p->rtort_rcu, rcu_torture_cb); + call_rcu(&p->rtort_rcu, rcu_torture_cb); } static void rcu_sync_torture_init(void) @@ -619,7 +619,7 @@ static struct rcu_torture_ops rcu_ops = { .poll_gp_state_exp = poll_state_synchronize_rcu, .cond_sync_exp = cond_synchronize_rcu_expedited, .cond_sync_exp_full = cond_synchronize_rcu_expedited_full, - .call = call_rcu_hurry, + .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .gp_kthread_dbg = show_rcu_gp_kthreads, @@ -1145,7 +1145,7 @@ static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) static void synchronize_rcu_mult_test(void) { - synchronize_rcu_mult(call_rcu_tasks, call_rcu_hurry); + synchronize_rcu_mult(call_rcu_tasks, call_rcu); } static struct rcu_torture_ops tasks_ops = { @@ -1632,6 +1632,17 @@ static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void)) } /* + * Do an rcu_barrier() to motivate lazy callbacks during a stutter + * pause. Without this, we can get false-positive rtort_pipe_count + * splats. + */ +static void rcu_torture_writer_work(struct work_struct *work) +{ + if (cur_ops->cb_barrier) + cur_ops->cb_barrier(); +} + +/* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure * after a series of grace periods (the "pipeline").
@@ -1651,6 +1662,7 @@ rcu_torture_writer(void *arg) int i; int idx; unsigned long j; + struct work_struct lazy_work; int oldnice = task_nice(current); struct rcu_gp_oldstate *rgo = NULL; int rgo_size = 0; @@ -1703,6 +1715,9 @@ rcu_torture_writer(void *arg) pr_alert("%s" TORTURE_FLAG " Waited %lu jiffies for boot to complete.\n", torture_type, jiffies - j); + if (IS_ENABLED(CONFIG_RCU_LAZY)) + INIT_WORK_ONSTACK(&lazy_work, rcu_torture_writer_work); + do { rcu_torture_writer_state = RTWS_FIXED_DELAY; torture_hrtimeout_us(500, 1000, &rand); @@ -1895,6 +1910,8 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; + if (IS_ENABLED(CONFIG_RCU_LAZY)) + queue_work(system_percpu_wq, &lazy_work); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && !atomic_read(&rcu_fwd_cb_nodelay) && @@ -1925,6 +1942,12 @@ rcu_torture_writer(void *arg) pr_alert("%s" TORTURE_FLAG " Dynamic grace-period expediting was disabled.\n", torture_type); + + if (IS_ENABLED(CONFIG_RCU_LAZY)) { + cancel_work_sync(&lazy_work); + destroy_work_on_stack(&lazy_work); + } + kfree(ulo); kfree(rgo); rcu_torture_writer_state = RTWS_STOPPING; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0d01cd8c4b4a..7c2f7cc131f7 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -897,11 +897,9 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp { int cpu; - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - if (!(mask & (1UL << (cpu - snp->grplo)))) - continue; - srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); - } + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) + if ((mask & (1UL << (cpu - snp->grplo))) && rcu_cpu_beenfullyonline(cpu)) + srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } /* @@ -1322,7 +1320,9 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, */ idx = __srcu_read_lock_nmisafe(ssp); ss_state = 
smp_load_acquire(&ssp->srcu_sup->srcu_size_state); - if (ss_state < SRCU_SIZE_WAIT_CALL) + // If !rcu_cpu_beenfullyonline(), interrupts are still disabled, + // so no migration is possible in either direction from this CPU. + if (ss_state < SRCU_SIZE_WAIT_CALL || !rcu_cpu_beenfullyonline(raw_smp_processor_id())) sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); else sdp = raw_cpu_ptr(ssp->sda); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 48f0d803c8e2..f4da5fad70f5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -373,7 +373,8 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, // Queuing callbacks before initialization not yet supported. if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist))) rcu_segcblist_init(&rtpcp->cblist); - needwake = (func == wakeme_after_rcu) || + needwake = (!havekthread && rcu_segcblist_empty(&rtpcp->cblist)) || + (func == wakeme_after_rcu) || (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim); if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) { if (rtp->lazy_jiffies) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 55df6d37145e..03a43d3d2616 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -492,7 +492,7 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param int ret = kstrtoul(val, 0, &j); if (!ret) { - WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? 
HZ : (j ?: 1)); + WRITE_ONCE(*(ulong *)kp->arg, clamp_val(j, 1, HZ)); adjust_jiffies_till_sched_qs(); } return ret; @@ -969,14 +969,11 @@ static int rcu_watching_snap_recheck(struct rcu_data *rdp) if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) { int cpu = rdp->cpu; struct rcu_snap_record *rsrp; - struct kernel_cpustat *kcsp; - - kcsp = &kcpustat_cpu(cpu); rsrp = &rdp->snap_record; - rsrp->cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); - rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); - rsrp->cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); + rsrp->cputime_irq = kcpustat_field(CPUTIME_IRQ, cpu); + rsrp->cputime_softirq = kcpustat_field(CPUTIME_SOFTIRQ, cpu); + rsrp->cputime_system = kcpustat_field(CPUTIME_SYSTEM, cpu); rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu) + arch_irq_stat_cpu(cpu); rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu); rsrp->nr_csw = nr_context_switches_cpu(cpu); @@ -1632,17 +1629,21 @@ static void rcu_sr_put_wait_head(struct llist_node *node) atomic_set_release(&sr_wn->inuse, 0); } -/* Enable rcu_normal_wake_from_gp automatically on small systems. */ -#define WAKE_FROM_GP_CPU_THRESHOLD 16 - -static int rcu_normal_wake_from_gp = -1; +static int rcu_normal_wake_from_gp = 1; module_param(rcu_normal_wake_from_gp, int, 0644); static struct workqueue_struct *sync_wq; +#define RCU_SR_NORMAL_LATCH_THR 64 + +/* Number of in-flight synchronize_rcu() calls queued on srs_next. */ +static atomic_long_t rcu_sr_normal_count; +static int rcu_sr_normal_latched; /* 0/1 */ + static void rcu_sr_normal_complete(struct llist_node *node) { struct rcu_synchronize *rs = container_of( (struct rcu_head *) node, struct rcu_synchronize, head); + long nr; WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && !poll_state_synchronize_rcu_full(&rs->oldstate), @@ -1650,6 +1651,15 @@ static void rcu_sr_normal_complete(struct llist_node *node) /* Finally. 
*/ complete(&rs->completion); + nr = atomic_long_dec_return(&rcu_sr_normal_count); + WARN_ON_ONCE(nr < 0); + + /* + * Unlatch: switch back to normal path when fully + * drained and if it has been latched. + */ + if (nr == 0) + (void)cmpxchg_relaxed(&rcu_sr_normal_latched, 1, 0); } static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) @@ -1795,6 +1805,24 @@ static bool rcu_sr_normal_gp_init(void) static void rcu_sr_normal_add_req(struct rcu_synchronize *rs) { + /* + * Increment before publish to avoid a complete + * vs enqueue race on latch. + */ + long nr = atomic_long_inc_return(&rcu_sr_normal_count); + + /* + * Latch when threshold is reached. Checking for an exact match + * restricts cmpxchg() to a single context. + * + * This latch is intentionally relaxed and best-effort. Concurrent + * set/clear can race and temporarily lose the latch, which is OK + * because it only selects between the fast and fallback paths. + */ + if (nr == RCU_SR_NORMAL_LATCH_THR) + (void)cmpxchg_relaxed(&rcu_sr_normal_latched, 0, 1); + + /* Publish for the GP kthread/worker. */ llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next); } @@ -2584,7 +2612,7 @@ static void rcu_do_batch(struct rcu_data *rdp) const long npj = NSEC_PER_SEC / HZ; long rrn = READ_ONCE(rcu_resched_ns); - rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? 
NSEC_PER_SEC : rrn; + rrn = clamp(rrn, NSEC_PER_MSEC, NSEC_PER_SEC); tlimit = local_clock() + rrn; jlimit = jiffies + (rrn + npj + 1) / npj; jlimit_check = true; @@ -3278,14 +3306,15 @@ static void synchronize_rcu_normal(void) { struct rcu_synchronize rs; + init_rcu_head_on_stack(&rs.head); trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request")); - if (READ_ONCE(rcu_normal_wake_from_gp) < 1) { + if (READ_ONCE(rcu_normal_wake_from_gp) < 1 || + READ_ONCE(rcu_sr_normal_latched)) { wait_rcu_gp(call_rcu_hurry); goto trace_complete_out; } - init_rcu_head_on_stack(&rs.head); init_completion(&rs.completion); /* @@ -3302,10 +3331,10 @@ static void synchronize_rcu_normal(void) /* Now we can wait. */ wait_for_completion(&rs.completion); - destroy_rcu_head_on_stack(&rs.head); trace_complete_out: trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete")); + destroy_rcu_head_on_stack(&rs.head); } /** @@ -4904,12 +4933,6 @@ void __init rcu_init(void) sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); WARN_ON(!sync_wq); - /* Respect if explicitly disabled via a boot parameter. */ - if (rcu_normal_wake_from_gp < 0) { - if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD) - rcu_normal_wake_from_gp = 1; - } - /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ if (qovld < 0) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 1047b30cd46b..373b877cf171 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -655,7 +655,7 @@ static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) * No-CBs GP kthreads come here to wait for additional callbacks to show up * or for grace periods to end. 
*/ -static void nocb_gp_wait(struct rcu_data *my_rdp) +static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp) { bool bypass = false; int __maybe_unused cpu = my_rdp->cpu; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b67532cb8770..cf7ae51cba40 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -479,7 +479,6 @@ static void print_cpu_stat_info(int cpu) { struct rcu_snap_record rsr, *rsrp; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu); if (!rcu_cpu_stall_cputime) return; @@ -488,9 +487,9 @@ static void print_cpu_stat_info(int cpu) if (rsrp->gp_seq != rdp->gp_seq) return; - rsr.cputime_irq = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); - rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); - rsr.cputime_system = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); + rsr.cputime_irq = kcpustat_field(CPUTIME_IRQ, cpu); + rsr.cputime_softirq = kcpustat_field(CPUTIME_SOFTIRQ, cpu); + rsr.cputime_system = kcpustat_field(CPUTIME_SYSTEM, cpu); pr_err("\t hardirqs softirqs csw/system\n"); pr_err("\t number: %8lld %10d %12lld\n", diff --git a/kernel/rseq.c b/kernel/rseq.c index 38d3ef540760..e75e3a5e312c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -236,11 +236,6 @@ static int __init rseq_debugfs_init(void) } __initcall(rseq_debugfs_init); -static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) -{ - return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); -} - static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { struct rseq __user *urseq = t->rseq.usrptr; @@ -258,14 +253,16 @@ efault: static void rseq_slowpath_update_usr(struct pt_regs *regs) { /* - * Preserve rseq state and user_irq state. The generic entry code - * clears user_irq on the way out, the non-generic entry - * architectures are not having user_irq. + * Preserve has_rseq and user_irq state. 
The generic entry code clears + * user_irq on the way out, the non-generic entry architectures are not + * setting user_irq. */ - const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; + const struct rseq_event evt_mask = { + .has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK, + .user_irq = true, + }; struct task_struct *t = current; struct rseq_ids ids; - u32 node_id; bool event; if (unlikely(t->flags & PF_EXITING)) @@ -301,9 +298,9 @@ static void rseq_slowpath_update_usr(struct pt_regs *regs) if (!event) return; - node_id = cpu_to_node(ids.cpu_id); + ids.node_id = cpu_to_node(ids.cpu_id); - if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { + if (unlikely(!rseq_update_usr(t, regs, &ids))) { /* * Clear the errors just in case this might survive magically, but * leave the rest intact. @@ -335,8 +332,9 @@ void __rseq_handle_slowpath(struct pt_regs *regs) void __rseq_signal_deliver(int sig, struct pt_regs *regs) { rseq_stat_inc(rseq_stats.signal); + /* - * Don't update IDs, they are handled on exit to user if + * Don't update IDs yet, they are handled on exit to user if * necessary. The important thing is to abort a critical section of * the interrupted context as after this point the instruction * pointer in @regs points to the signal handler. @@ -349,6 +347,13 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs) current->rseq.event.error = 0; force_sigsegv(sig); } + + /* + * In legacy mode, force the update of IDs before returning to user + * space to stay compatible. + */ + if (!rseq_v2(current)) + rseq_force_update(); } /* @@ -384,19 +389,22 @@ void rseq_syscall(struct pt_regs *regs) static bool rseq_reset_ids(void) { - struct rseq_ids ids = { - .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, - .mm_cid = 0, - }; + struct rseq __user *rseq = current->rseq.usrptr; /* * If this fails, terminate it because this leaves the kernel in * stupid state as exit to user space will try to fixup the ids * again. 
*/ - if (rseq_set_ids(current, &ids, 0)) - return true; + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(0, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0, &rseq->node_id, efault); + unsafe_put_user(0, &rseq->mm_cid, efault); + } + return true; +efault: force_sig(SIGSEGV); return false; } @@ -404,70 +412,29 @@ static bool rseq_reset_ids(void) /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 -/* - * sys_rseq - setup restartable sequences for caller thread. - */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) { u32 rseqfl = 0; + u8 version = 1; - if (flags & RSEQ_FLAG_UNREGISTER) { - if (flags & ~RSEQ_FLAG_UNREGISTER) - return -EINVAL; - /* Unregister rseq for current thread. */ - if (current->rseq.usrptr != rseq || !current->rseq.usrptr) - return -EINVAL; - if (rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - if (!rseq_reset_ids()) - return -EFAULT; - rseq_reset(current); - return 0; - } - - if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) - return -EINVAL; - - if (current->rseq.usrptr) { - /* - * If rseq is already registered, check whether - * the provided address differs from the prior - * one. - */ - if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) - return -EINVAL; - if (current->rseq.sig != sig) - return -EPERM; - /* Already registered. */ - return -EBUSY; - } - - /* - * If there was no rseq previously registered, ensure the provided rseq - * is properly aligned, as communcated to user-space through the ELF - * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq - * size, the required alignment is the original struct rseq alignment. - * - * The rseq_len is required to be greater or equal to the original rseq - * size. 
In order to be valid, rseq_len is either the original rseq size, - * or large enough to contain all supported fields, as communicated to - * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. - */ - if (rseq_len < ORIG_RSEQ_SIZE || - (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || - rseq_len < offsetof(struct rseq, end)))) - return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; - if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; - if (rseq_slice_extension_enabled() && - (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) - rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + /* + * Architectures, which use the generic IRQ entry code (at least) enable + * registrations with a size greater than the original v1 fixed sized + * @rseq_len, which has been validated already to utilize the optimized + * v2 ABI mode which also enables extended RSEQ features beyond MMCID. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE) + version = 2; + + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) { + if (rseq_slice_extension_enabled()) { + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } } scoped_user_write_access(rseq, efault) { @@ -485,7 +452,15 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); - unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + + /* + * All fields past mm_cid are only valid for non-legacy v2 + * registrations. 
+ */ + if (version > 1) { + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + } } /* @@ -501,11 +476,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 #endif /* - * If rseq was previously inactive, and has just been - * registered, ensure the cpu_id_start and cpu_id fields - * are updated before returning to user-space. + * Ensure the cpu_id_start and cpu_id fields are updated before + * returning to user-space. */ - current->rseq.event.has_rseq = true; + current->rseq.event.has_rseq = version; rseq_force_update(); return 0; @@ -513,6 +487,80 @@ efault: return -EFAULT; } +static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) +{ + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) + return -EINVAL; + if (rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); + return 0; +} + +static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig) +{ + /* + * If rseq is already registered, check whether the provided address + * differs from the prior one. + */ + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; +} + +static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len) +{ + /* + * Ensure the provided rseq is properly aligned, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If + * rseq_len is the original rseq size, the required alignment is the + * original struct rseq alignment. 
+ * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. + */ + if (rseq_len < ORIG_RSEQ_SIZE) + return false; + + if (rseq_len == ORIG_RSEQ_SIZE) + return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE); + + return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) && + rseq_len >= offsetof(struct rseq, end); +} + +#define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + +/* + * sys_rseq - Register or unregister restartable sequences for the caller thread. + */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) +{ + if (flags & RSEQ_FLAG_UNREGISTER) + return rseq_unregister(rseq, rseq_len, flags, sig); + + if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED)) + return -EINVAL; + + if (current->rseq.usrptr) + return rseq_reregister(rseq, rseq_len, sig); + + if (!rseq_length_valid(rseq, rseq_len)) + return -EINVAL; + + return rseq_register(rseq, rseq_len, flags, sig); +} + #ifdef CONFIG_RSEQ_SLICE_EXTENSION struct slice_timer { struct hrtimer timer; @@ -713,6 +761,8 @@ int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) return -ENOTSUPP; if (!current->rseq.usrptr) return -ENXIO; + if (!rseq_v2(current)) + return -ENOTSUPP; /* No change? 
*/ if (enable == !!current->rseq.slice.state.enabled) diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 755883faf751..067979a7b69e 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -58,8 +58,17 @@ #include "deadline.c" #ifdef CONFIG_SCHED_CLASS_EXT +# include <linux/btf_ids.h> +# include <linux/find.h> +# include <linux/genalloc.h> +# include "ext_types.h" # include "ext_internal.h" +# include "ext_cid.h" +# include "ext_arena.h" +# include "ext_idle.h" # include "ext.c" +# include "ext_cid.c" +# include "ext_arena.c" # include "ext_idle.c" #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b8871449d3c6..8b791e9e9f67 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -537,13 +537,22 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } /* need a wrapper since we may need to trace from modules */ EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); -/* Call via the helper macro trace_set_current_state. */ +/* + * Call via the helper macro trace_set_current_state. + * Calls to this function MUST be guarded by a + * tracepoint_enabled(sched_set_state_tp) + */ void __trace_set_current_state(int state_value) { - trace_sched_set_state_tp(current, state_value); + trace_call__sched_set_state_tp(current, state_value); } EXPORT_SYMBOL(__trace_set_current_state); +int task_llc(const struct task_struct *p) +{ + return per_cpu(sd_llc_id, task_cpu(p)); +} + /* * Serialization rules: * @@ -615,6 +624,12 @@ EXPORT_SYMBOL(__trace_set_current_state); * [ The astute reader will observe that it is possible for two tasks on one * CPU to have ->on_cpu = 1 at the same time. ] * + * p->is_blocked <- { 0, 1 }: + * + * is set by try_to_block_task() and cleared by ttwu_do_wakeup() and tracks + * if the task is blocked. Traditionally this would mirror p->on_rq, however + * due things like DELAY_DEQUEUE and PROXY_EXEC, this can diverge. 
+ * * task_cpu(p): is changed by set_task_cpu(), the rules are: * * - Don't call set_task_cpu() on a blocked task: @@ -1203,9 +1218,13 @@ static void __resched_curr(struct rq *rq, int tif) } } +/* + * Calls to this function MUST be guarded by a + * tracepoint_enabled(sched_set_need_resched_tp) + */ void __trace_set_need_resched(struct task_struct *curr, int tif) { - trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif); + trace_call__sched_set_need_resched_tp(curr, smp_processor_id(), tif); } EXPORT_SYMBOL_GPL(__trace_set_need_resched); @@ -2223,8 +2242,29 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } -static void block_task(struct rq *rq, struct task_struct *p, int flags) +static void block_task(struct rq *rq, struct task_struct *p, unsigned long task_state) { + int flags = DEQUEUE_NOCLOCK; + + p->sched_contributes_to_load = + (task_state & TASK_UNINTERRUPTIBLE) && + !(task_state & TASK_NOLOAD) && + !(task_state & TASK_FROZEN); + + if (unlikely(is_special_task_state(task_state))) + flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. 
+ */ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) __block_task(rq, p); } @@ -3685,6 +3725,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) */ static inline void ttwu_do_wakeup(struct task_struct *p) { + p->is_blocked = 0; WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); } @@ -3701,6 +3742,65 @@ void update_rq_avg_idle(struct rq *rq) rq->idle_stamp = 0; } +#ifdef CONFIG_SCHED_PROXY_EXEC +static void zap_balance_callbacks(struct rq *rq); + +static inline void proxy_reset_donor(struct rq *rq) +{ + WARN_ON_ONCE(rq->donor == rq->curr); + + put_prev_set_next_task(rq, rq->donor, rq->curr); + rq_set_donor(rq, rq->curr); + zap_balance_callbacks(rq); + resched_curr(rq); +} + +/* + * Checks to see if task p has been proxy-migrated to another rq + * and needs to be returned. If so, we deactivate the task here + * so that it can be properly woken up on the p->wake_cpu + * (or whichever cpu select_task_rq() picks at the bottom of + * try_to_wake_up() + */ +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p) +{ + /* + * Typically per __set_task_cpu(), task_cpu(p) == p->wake_cpu. + * + * However, proxy_set_task_cpu() is such that it preserves the + * original cpu in p->wake_cpu while migrating p for proxy reasons + * (possibly outside of the allowed p->cpus_ptr). + * + * Furthermore, migration_cpu_stop() / __migrate_swap_task(), will + * only set p->wake_cpu when !p->on_rq, and since here p->on_rq, this + * will not apply. But if it did, this check is the safe way around + * and would migrate. 
+ */ + if (task_cpu(p) == p->wake_cpu) + return false; + + scoped_guard(raw_spinlock, &p->blocked_lock) { + /* Task is waking up; clear any blocked_on relationship */ + __clear_task_blocked_on(p, NULL); + + /* If already current, don't need to return migrate */ + if (task_current(rq, p)) + return false; + + /* If we're return migrating the rq->donor, switch it out for idle */ + if (task_current_donor(rq, p)) + proxy_reset_donor(rq); + } + block_task(rq, p, TASK_WAKING); + return true; +} +#else /* !CONFIG_SCHED_PROXY_EXEC */ +static inline bool proxy_needs_return(struct rq *rq, struct task_struct *p) +{ + return false; +} +#endif /* CONFIG_SCHED_PROXY_EXEC */ + static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) @@ -3716,8 +3816,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; - else - if (p->in_iowait) { + else if (p->in_iowait) { delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -3765,28 +3864,28 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, */ static int ttwu_runnable(struct task_struct *p, int wake_flags) { - struct rq_flags rf; - struct rq *rq; - int ret = 0; + ACQUIRE(__task_rq_lock, guard)(p); + struct rq *rq = guard.rq; - rq = __task_rq_lock(p, &rf); - if (task_on_rq_queued(p)) { - update_rq_clock(rq); + if (!task_on_rq_queued(p)) + return 0; + + update_rq_clock(rq); + if (p->is_blocked) { if (p->se.sched_delayed) enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); - if (!task_on_cpu(rq, p)) { - /* - * When on_rq && !on_cpu the task is preempted, see if - * it should preempt the task that is current now. 
- */ - wakeup_preempt(rq, p, wake_flags); - } - ttwu_do_wakeup(p); - ret = 1; + if (proxy_needs_return(rq, p)) + return 0; } - __task_rq_unlock(rq, p, &rf); - - return ret; + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); + return 1; } void sched_ttwu_pending(void *arg) @@ -4173,6 +4272,9 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * it disabling IRQs (this allows not taking ->pi_lock). */ WARN_ON_ONCE(p->se.sched_delayed); + WARN_ON_ONCE(p->is_blocked); + /* If p is current, we know we can run here, so clear blocked_on */ + clear_task_blocked_on(p, NULL); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4189,6 +4291,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { smp_mb__after_spinlock(); + if (!ttwu_state_match(p, state, &success)) break; @@ -4297,6 +4400,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); + } else if (cpu != p->wake_cpu) { + /* + * If we were proxy-migrated to cpu, then + * select_task_rq() picks cpu instead of wake_cpu + * to return to, we won't call set_task_cpu(), + * leaving a stale wake_cpu pointing to where we + * proxy-migrated from. So just fixup wake_cpu here + * if its not correct + */ + p->wake_cpu = cpu; } ttwu_queue(p, cpu, wake_flags); @@ -4463,6 +4576,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) /* A delayed task cannot be in clone(). 
*/ WARN_ON_ONCE(p->se.sched_delayed); + WARN_ON_ONCE(p->is_blocked); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -4498,6 +4612,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; + init_sched_mm(p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -4710,6 +4825,7 @@ int sched_fork(u64 clone_flags, struct task_struct *p) p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; + p->timer_slack_ns = p->default_timer_slack_ns; } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); @@ -5518,7 +5634,11 @@ void sched_exec(void) } DEFINE_PER_CPU(struct kernel_stat, kstat); -DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat) = { +#ifdef CONFIG_NO_HZ_COMMON + .idle_sleeptime_seq = SEQCNT_ZERO(kernel_cpustat.idle_sleeptime_seq) +#endif +}; EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); @@ -5972,10 +6092,9 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) schedstat_inc(this_rq()->sched_count); } -static void prev_balance(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) +static void prev_balance(struct rq *rq, struct rq_flags *rf) { - const struct sched_class *start_class = prev->sched_class; + const struct sched_class *start_class = rq->donor->sched_class; const struct sched_class *class; /* @@ -5987,7 +6106,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, * a runnable task of @class priority or higher. 
*/ for_active_class_range(class, start_class, &idle_sched_class) { - if (class->balance && class->balance(rq, prev, rf)) + if (class->balance && class->balance(rq, rf)) break; } } @@ -5996,7 +6115,7 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, * Pick up the highest-prio task: */ static inline struct task_struct * -__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +__pick_next_task(struct rq *rq, struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { const struct sched_class *class; @@ -6013,40 +6132,31 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. */ - if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && + if (likely(!sched_class_above(rq->donor->sched_class, &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_queued)) { - p = pick_next_task_fair(rq, prev, rf); + p = pick_task_fair(rq, rf); if (unlikely(p == RETRY_TASK)) goto restart; /* Assume the next prioritized class is idle_sched_class */ - if (!p) { + if (!p) p = pick_task_idle(rq, rf); - put_prev_set_next_task(rq, prev, p); - } + put_prev_set_next_task(rq, rq->donor, p); return p; } restart: - prev_balance(rq, prev, rf); + prev_balance(rq, rf); for_each_active_class(class) { - if (class->pick_next_task) { - p = class->pick_next_task(rq, prev, rf); - if (unlikely(p == RETRY_TASK)) - goto restart; - if (p) - return p; - } else { - p = class->pick_task(rq, rf); - if (unlikely(p == RETRY_TASK)) - goto restart; - if (p) { - put_prev_set_next_task(rq, prev, p); - return p; - } + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; + if (p) { + put_prev_set_next_task(rq, rq->donor, p); + return p; } } @@ -6097,7 +6207,7 @@ extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_f static void queue_core_balance(struct rq *rq); static struct task_struct * 
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +pick_next_task(struct rq *rq, struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { struct task_struct *next, *p, *max; @@ -6110,7 +6220,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) bool need_sync; if (!sched_core_enabled(rq)) - return __pick_next_task(rq, prev, rf); + return __pick_next_task(rq, rf); cpu = cpu_of(rq); @@ -6123,7 +6233,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) */ rq->core_pick = NULL; rq->core_dl_server = NULL; - return __pick_next_task(rq, prev, rf); + return __pick_next_task(rq, rf); } /* @@ -6147,7 +6257,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) goto out_set_next; } - prev_balance(rq, prev, rf); + prev_balance(rq, rf); smt_mask = cpu_smt_mask(cpu); need_sync = !!rq->core->core_cookie; @@ -6329,7 +6439,7 @@ restart_multi: } out_set_next: - put_prev_set_next_task(rq, prev, next); + put_prev_set_next_task(rq, rq->donor, next); if (rq->core->core_forceidle_count && next == rq->idle) queue_core_balance(rq); @@ -6552,10 +6662,10 @@ static inline void sched_core_cpu_deactivate(unsigned int cpu) {} static inline void sched_core_cpu_dying(unsigned int cpu) {} static struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +pick_next_task(struct rq *rq, struct rq_flags *rf) __must_hold(__rq_lockp(rq)) { - return __pick_next_task(rq, prev, rf); + return __pick_next_task(rq, rf); } #endif /* !CONFIG_SCHED_CORE */ @@ -6583,16 +6693,19 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, unsigned long *task_state_p, bool should_block) { unsigned long task_state = *task_state_p; - int flags = DEQUEUE_NOCLOCK; + + WARN_ON_ONCE(p->is_blocked); if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); *task_state_p = TASK_RUNNING; - set_task_blocked_on_waking(p, NULL); + 
clear_task_blocked_on(p, NULL); return false; } + p->is_blocked = 1; + /* * We check should_block after signal_pending because we * will want to wake the task in that case. But if @@ -6603,26 +6716,7 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, if (!should_block) return false; - p->sched_contributes_to_load = - (task_state & TASK_UNINTERRUPTIBLE) && - !(task_state & TASK_NOLOAD) && - !(task_state & TASK_FROZEN); - - if (unlikely(is_special_task_state(task_state))) - flags |= DEQUEUE_SPECIAL; - - /* - * __schedule() ttwu() - * prev_state = prev->state; if (p->on_rq && ...) - * if (prev_state) goto out; - * p->on_rq = 0; smp_acquire__after_ctrl_dep(); - * p->state = TASK_WAKING - * - * Where __schedule() and ttwu() have matching control dependencies. - * - * After this, schedule() must not care about p->state any more. - */ - block_task(rq, p, flags); + block_task(rq, p, task_state); return true; } @@ -6645,18 +6739,18 @@ static inline void proxy_set_task_cpu(struct task_struct *p, int cpu) static inline struct task_struct *proxy_resched_idle(struct rq *rq) { put_prev_set_next_task(rq, rq->donor, rq->idle); + rq->next_class = &idle_sched_class; rq_set_donor(rq, rq->idle); set_tsk_need_resched(rq->idle); return rq->idle; } -static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) +static void proxy_deactivate(struct rq *rq, struct task_struct *donor) { unsigned long state = READ_ONCE(donor->__state); - /* Don't deactivate if the state has been changed to TASK_RUNNING */ - if (state == TASK_RUNNING) - return false; + WARN_ON_ONCE(state == TASK_RUNNING); + WARN_ON_ONCE(donor->blocked_on); /* * Because we got donor from pick_next_task(), it is *crucial* * that we call proxy_resched_idle() before we deactivate it. @@ -6667,7 +6761,7 @@ static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) * need to be changed from next *before* we deactivate. 
*/ proxy_resched_idle(rq); - return try_to_block_task(rq, donor, &state, true); + block_task(rq, donor, state); } static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf) @@ -6741,76 +6835,21 @@ static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, proxy_reacquire_rq_lock(rq, rf); } -static void proxy_force_return(struct rq *rq, struct rq_flags *rf, - struct task_struct *p) - __must_hold(__rq_lockp(rq)) -{ - struct rq *task_rq, *target_rq = NULL; - int cpu, wake_flag = WF_TTWU; - - lockdep_assert_rq_held(rq); - WARN_ON(p == rq->curr); - - if (p == rq->donor) - proxy_resched_idle(rq); - - proxy_release_rq_lock(rq, rf); - /* - * We drop the rq lock, and re-grab task_rq_lock to get - * the pi_lock (needed for select_task_rq) as well. - */ - scoped_guard (task_rq_lock, p) { - task_rq = scope.rq; - - /* - * Since we let go of the rq lock, the task may have been - * woken or migrated to another rq before we got the - * task_rq_lock. So re-check we're on the same RQ. If - * not, the task has already been migrated and that CPU - * will handle any futher migrations. - */ - if (task_rq != rq) - break; - - /* - * Similarly, if we've been dequeued, someone else will - * wake us - */ - if (!task_on_rq_queued(p)) - break; - - /* - * Since we should only be calling here from __schedule() - * -> find_proxy_task(), no one else should have - * assigned current out from under us. But check and warn - * if we see this, then bail. 
- */ - if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) { - WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n", - __func__, cpu_of(task_rq), - p->comm, p->pid, p->on_cpu); - break; - } - - update_rq_clock(task_rq); - deactivate_task(task_rq, p, DEQUEUE_NOCLOCK); - cpu = select_task_rq(p, p->wake_cpu, &wake_flag); - set_task_cpu(p, cpu); - target_rq = cpu_rq(cpu); - clear_task_blocked_on(p, NULL); - } - - if (target_rq) - attach_one_task(target_rq, p); - - proxy_reacquire_rq_lock(rq, rf); -} - /* * Find runnable lock owner to proxy for mutex blocked donor * * Follow the blocked-on relation: - * task->blocked_on -> mutex->owner -> task... + * + * ,-> task + * | | blocked-on + * | v + * blocked_donor | mutex + * | | owner + * | v + * `-- task + * + * and set the blocked_donor relation, this latter is used by the mutex + * code to find which (blocked) task to hand-off to. * * Lock order: * @@ -6830,18 +6869,19 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) bool curr_in_chain = false; int this_cpu = cpu_of(rq); struct task_struct *p; - struct mutex *mutex; int owner_cpu; /* Follow blocked_on chain. */ - for (p = donor; (mutex = p->blocked_on); p = owner) { + for (p = donor; p->is_blocked; p = owner) { /* if its PROXY_WAKING, do return migration or run if current */ - if (mutex == PROXY_WAKING) { + struct mutex *mutex = p->blocked_on; + if (!mutex) { + clear_task_blocked_on(p, mutex); if (task_current(rq, p)) { - clear_task_blocked_on(p, PROXY_WAKING); + p->is_blocked = 0; return p; } - goto force_return; + goto deactivate; } /* @@ -6872,17 +6912,19 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * and return p (if it is current and safe to * just run on this rq), or return-migrate the task. 
*/ + __clear_task_blocked_on(p, NULL); if (task_current(rq, p)) { - __clear_task_blocked_on(p, NULL); + p->is_blocked = 0; return p; } - goto force_return; + goto deactivate; } if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { /* XXX Don't handle blocked owners/delayed dequeue yet */ if (curr_in_chain) return proxy_resched_idle(rq); + __clear_task_blocked_on(p, NULL); goto deactivate; } @@ -6950,17 +6992,13 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * rq, therefore holding @rq->lock is sufficient to * guarantee its existence, as per ttwu_remote(). */ + owner->blocked_donor = p; } WARN_ON_ONCE(owner && !owner->on_rq); return owner; deactivate: - if (proxy_deactivate(rq, donor)) - return NULL; - /* If deactivate fails, force return */ - p = donor; -force_return: - proxy_force_return(rq, rf, p); + proxy_deactivate(rq, p); return NULL; migrate_task: proxy_migrate_task(rq, rf, p, owner_cpu); @@ -7102,13 +7140,14 @@ static void __sched notrace __schedule(int sched_mode) pick_again: assert_balance_callbacks_empty(rq); - next = pick_next_task(rq, rq->donor, &rf); + next = pick_next_task(rq, &rf); rq->next_class = next->sched_class; if (sched_proxy_exec()) { struct task_struct *prev_donor = rq->donor; rq_set_donor(rq, next); - if (unlikely(next->blocked_on)) { + next->blocked_donor = NULL; + if (unlikely(next->is_blocked)) { next = find_proxy_task(rq, next, &rf); if (!next) { zap_balance_callbacks(rq); @@ -7964,7 +8003,7 @@ static void __sched_dynamic_update(int mode) break; } - preempt_dynamic_mode = mode; + WRITE_ONCE(preempt_dynamic_mode, mode); } void sched_dynamic_update(int mode) @@ -8005,12 +8044,13 @@ static void __init preempt_dynamic_init(void) } } -# define PREEMPT_MODEL_ACCESSOR(mode) \ - bool preempt_model_##mode(void) \ - { \ - WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ - return preempt_dynamic_mode == preempt_dynamic_##mode; \ - } \ +# define PREEMPT_MODEL_ACCESSOR(mode) \ + bool 
preempt_model_##mode(void) \ + { \ + int mode = READ_ONCE(preempt_dynamic_mode); \ + WARN_ON_ONCE(mode == preempt_dynamic_undefined); \ + return mode == preempt_dynamic_##mode; \ + } \ EXPORT_SYMBOL_GPL(preempt_model_##mode) PREEMPT_MODEL_ACCESSOR(none); @@ -8604,18 +8644,14 @@ static void cpuset_cpu_inactive(unsigned int cpu) static inline void sched_smt_present_inc(int cpu) { -#ifdef CONFIG_SCHED_SMT if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_inc_cpuslocked(&sched_smt_present); -#endif } static inline void sched_smt_present_dec(int cpu) { -#ifdef CONFIG_SCHED_SMT if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_present); -#endif } int sched_cpu_activate(unsigned int cpu) @@ -8670,7 +8706,8 @@ int sched_cpu_deactivate(unsigned int cpu) * Remove CPU from nohz.idle_cpus_mask to prevent participating in * load balancing when not active */ - nohz_balance_exit_idle(rq); + scoped_guard (rcu) + nohz_balance_exit_idle(rq); set_cpu_active(cpu, false); @@ -8694,6 +8731,8 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu(); + sched_domains_free_llc_id(cpu); + sched_set_rq_offline(rq, cpu); scx_rq_deactivate(rq); @@ -8703,9 +8742,7 @@ int sched_cpu_deactivate(unsigned int cpu) */ sched_smt_present_dec(cpu); -#ifdef CONFIG_SCHED_SMT sched_core_cpu_deactivate(cpu); -#endif if (!sched_smp_initialized) return 0; @@ -8873,7 +8910,7 @@ static struct kmem_cache *task_group_cache __ro_after_init; void __init sched_init(void) { - unsigned long ptr = 0; + unsigned long __maybe_unused ptr = 0; int i; /* Make sure the linker didn't screw up */ @@ -8889,36 +8926,24 @@ void __init sched_init(void) wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - ptr += 2 * nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - ptr += 2 * nr_cpu_ids * sizeof(void **); -#endif - if (ptr) { - ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); - -#ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.se = (struct sched_entity **)ptr; - ptr 
+= nr_cpu_ids * sizeof(void **); + root_task_group.cfs_rq = &runqueues.cfs; - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_EXT_GROUP_SCHED - scx_tg_init(&root_task_group); + scx_tg_init(&root_task_group); #endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ - } init_defrootdomain(); @@ -9027,6 +9052,11 @@ void __init sched_init(void) rq->core_cookie = 0UL; #endif +#ifdef CONFIG_SCHED_CACHE + raw_spin_lock_init(&rq->cpu_epoch_lock); + rq->cpu_epoch_next = jiffies; +#endif + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); } @@ -9828,15 +9858,18 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, } for_each_online_cpu(i) { - struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, i); struct rq *rq = cfs_rq->rq; guard(rq_lock_irq)(rq); + cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 1; - if (cfs_rq->throttled) + if (cfs_rq->throttled) { + update_rq_clock(rq); unthrottle_cfs_rq(cfs_rq); + } } if (runtime_was_enabled && !runtime_enabled) @@ -9977,7 +10010,7 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) int i; for_each_possible_cpu(i) { - stats = 
__schedstats_from_se(tg->se[i]); + stats = __schedstats_from_se(tg_se(tg, i)); ws += schedstat_val(stats->wait_sum); } @@ -9996,7 +10029,7 @@ static u64 throttled_time_self(struct task_group *tg) u64 total = 0; for_each_possible_cpu(i) { - total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); + total += READ_ONCE(tg_cfs_rq(tg, i)->throttled_clock_self_time); } return total; diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 73b6b2426911..43e0bde3038e 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -136,7 +136,7 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, struct pid *grp; int err = 0; - if (!static_branch_likely(&sched_smt_present)) + if (!sched_smt_active()) return -ENODEV; BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index fbf31db0d2f3..679ac65be6b0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,6 +2,7 @@ /* * Simple CPU accounting cgroup controller */ +#include <linux/sched/clock.h> #include <linux/sched/cputime.h> #include <linux/tsacct_kern.h> #include "sched.h" @@ -46,7 +47,8 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, u64_stats_update_begin(&irqtime->sync); cpustat[idx] += delta; irqtime->total += delta; - irqtime->tick_delta += delta; + if (!kcpustat_idle_dyntick()) + irqtime->tick_delta += delta; u64_stats_update_end(&irqtime->sync); } @@ -414,16 +416,219 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, } } -static void irqtime_account_idle_ticks(int ticks) -{ - irqtime_account_process_tick(current, 0, ticks); -} #else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ -static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, int nr_ticks) { } #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ +#ifdef CONFIG_NO_HZ_COMMON +static void 
kcpustat_idle_stop(struct kernel_cpustat *kc, u64 now) +{ + u64 *cpustat = kc->cpustat; + u64 delta, steal, steal_delta; + int iowait; + + if (!kc->idle_elapse) + return; + + iowait = nr_iowait_cpu(smp_processor_id()) > 0; + delta = now - kc->idle_entrytime; + steal = steal_account_process_time(delta); + + /* + * Record the idle time after substracting the steal time from + * previous update sequence. Don't substract the steal time from + * the current update sequence to avoid readers moving backward. + */ + write_seqcount_begin(&kc->idle_sleeptime_seq); + steal_delta = min_t(u64, kc->idle_stealtime[iowait], delta); + delta -= steal_delta; + kc->idle_stealtime[iowait] -= steal_delta; + + if (iowait) + cpustat[CPUTIME_IOWAIT] += delta; + else + cpustat[CPUTIME_IDLE] += delta; + + kc->idle_stealtime[iowait] += steal; + kc->idle_entrytime = now; + kc->idle_elapse = false; + write_seqcount_end(&kc->idle_sleeptime_seq); +} + +static void kcpustat_idle_start(struct kernel_cpustat *kc, u64 now) +{ + /* Irqtime accounting might have been enabled in the middle of the IRQ */ + if (kc->idle_elapse) + return; + + write_seqcount_begin(&kc->idle_sleeptime_seq); + kc->idle_entrytime = now; + kc->idle_elapse = true; + write_seqcount_end(&kc->idle_sleeptime_seq); +} + +void kcpustat_dyntick_stop(u64 now) +{ + struct kernel_cpustat *kc = kcpustat_this_cpu; + + if (!vtime_generic_enabled_this_cpu()) { + WARN_ON_ONCE(!kc->idle_dyntick); + kcpustat_idle_stop(kc, now); + kc->idle_dyntick = false; + vtime_dyntick_stop(); + } +} + +void kcpustat_dyntick_start(u64 now) +{ + struct kernel_cpustat *kc = kcpustat_this_cpu; + + if (!vtime_generic_enabled_this_cpu()) { + vtime_dyntick_start(); + kc->idle_dyntick = true; + kcpustat_idle_start(kc, now); + } +} + +void kcpustat_irq_enter(u64 now) +{ + struct kernel_cpustat *kc = kcpustat_this_cpu; + + if (!vtime_generic_enabled_this_cpu() && + (irqtime_enabled() || vtime_accounting_enabled_this_cpu())) + kcpustat_idle_stop(kc, now); +} + +void 
kcpustat_irq_exit(u64 now) +{ + struct kernel_cpustat *kc = kcpustat_this_cpu; + + /* + * Generic vtime already does its own idle accounting. + * But irqtime accounting or arch vtime which also accounts IRQs + * need to pause nohz accounting. Resume nohz accounting as long + * as the irqtime config is enabled to handle case where irqtime + * accounting got runtime disabled in the middle of an IRQ. + */ + if (!vtime_generic_enabled_this_cpu() && + (IS_ENABLED(CONFIG_IRQ_TIME_ACCOUNTING) || vtime_accounting_enabled_this_cpu())) + kcpustat_idle_start(kc, now); +} + +static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx, + bool compute_delta, u64 now) +{ + struct kernel_cpustat *kc = &kcpustat_cpu(cpu); + int iowait = idx == CPUTIME_IOWAIT; + u64 *cpustat = kc->cpustat; + unsigned int seq; + u64 idle; + + do { + seq = read_seqcount_begin(&kc->idle_sleeptime_seq); + + idle = cpustat[idx]; + + if (kc->idle_elapse && compute_delta && now > kc->idle_entrytime) { + u64 delta = now - kc->idle_entrytime; + + delta -= min_t(u64, kc->idle_stealtime[iowait], delta); + idle += delta; + } + } while (read_seqcount_retry(&kc->idle_sleeptime_seq, seq)); + + return idle; +} + +u64 kcpustat_field_idle(int cpu) +{ + return kcpustat_field_dyntick(cpu, CPUTIME_IDLE, + !nr_iowait_cpu(cpu), ktime_get()); +} +EXPORT_SYMBOL_GPL(kcpustat_field_idle); + +u64 kcpustat_field_iowait(int cpu) +{ + return kcpustat_field_dyntick(cpu, CPUTIME_IOWAIT, + nr_iowait_cpu(cpu), ktime_get()); +} +EXPORT_SYMBOL_GPL(kcpustat_field_iowait); +#else +static u64 kcpustat_field_dyntick(int cpu, enum cpu_usage_stat idx, + bool compute_delta, ktime_t now) +{ + return kcpustat_cpu(cpu).cpustat[idx]; +} +#endif /* CONFIG_NO_HZ_COMMON */ + +static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx, + bool compute_delta, u64 *last_update_time) +{ + ktime_t now = ktime_get(); + u64 res; + + if (vtime_generic_enabled_cpu(cpu)) + res = kcpustat_field(idx, cpu); + else + res = kcpustat_field_dyntick(cpu, 
idx, compute_delta, now); + + do_div(res, NSEC_PER_USEC); + + if (last_update_time) + *last_update_time = ktime_to_us(now); + + return res; +} + +/** + * get_cpu_idle_time_us - get the total idle time of a CPU + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. + * + * Return the cumulative idle time (since boot) for a given + * CPU, in microseconds. Note that this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * Return: total idle time of the @cpu + */ +u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) +{ + return get_cpu_sleep_time_us(cpu, CPUTIME_IDLE, + !nr_iowait_cpu(cpu), last_update_time); +} +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); + +/** + * get_cpu_iowait_time_us - get the total iowait time of a CPU + * @cpu: CPU number to query + * @last_update_time: variable to store update time in. Do not update + * counters if NULL. + * + * Return the cumulative iowait time (since boot) for a given + * CPU, in microseconds. Note this is partially broken due to + * the counter of iowait tasks that can be remotely updated without + * any synchronization. Therefore it is possible to observe backward + * values within two consecutive reads. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. 
+ * + * Return: total iowait time of @cpu + */ +u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) +{ + return get_cpu_sleep_time_us(cpu, CPUTIME_IOWAIT, + nr_iowait_cpu(cpu), last_update_time); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); + /* * Use precise platform statistics if available: */ @@ -437,11 +642,15 @@ void vtime_account_irq(struct task_struct *tsk, unsigned int offset) vtime_account_hardirq(tsk); } else if (pc & SOFTIRQ_OFFSET) { vtime_account_softirq(tsk); - } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && - is_idle_task(tsk)) { - vtime_account_idle(tsk); + } else if (!kcpustat_idle_dyntick()) { + if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + is_idle_task(tsk)) { + vtime_account_idle(tsk); + } else { + vtime_account_kernel(tsk); + } } else { - vtime_account_kernel(tsk); + vtime_reset(); } } @@ -483,6 +692,9 @@ void account_process_tick(struct task_struct *p, int user_tick) if (vtime_accounting_enabled_this_cpu()) return; + if (kcpustat_idle_dyntick()) + return; + if (irqtime_enabled()) { irqtime_account_process_tick(p, user_tick, 1); return; @@ -505,29 +717,6 @@ void account_process_tick(struct task_struct *p, int user_tick) } /* - * Account multiple ticks of idle time. - * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - u64 cputime, steal; - - if (irqtime_enabled()) { - irqtime_account_idle_ticks(ticks); - return; - } - - cputime = ticks * TICK_NSEC; - steal = steal_account_process_time(ULONG_MAX); - - if (steal >= cputime) - return; - - cputime -= steal; - account_idle_time(cputime); -} - -/* * Adjust tick based cputime random precision against scheduler runtime * accounting. * @@ -587,12 +776,6 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, } stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); - /* - * Because mul_u64_u64_div_u64() can approximate on some - * achitectures; enforce the constraint that: a*b/(b+c) <= a. 
- */ - if (unlikely(stime > rtime)) - stime = rtime; update: /* @@ -773,9 +956,9 @@ void vtime_guest_exit(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_guest_exit); -void vtime_account_idle(struct task_struct *tsk) +static void __vtime_account_idle(struct vtime *vtime) { - account_idle_time(get_vtime_delta(&tsk->vtime)); + account_idle_time(get_vtime_delta(vtime)); } void vtime_task_switch_generic(struct task_struct *prev) @@ -784,7 +967,7 @@ void vtime_task_switch_generic(struct task_struct *prev) write_seqcount_begin(&vtime->seqcount); if (vtime->state == VTIME_IDLE) - vtime_account_idle(prev); + __vtime_account_idle(vtime); else __vtime_account_kernel(prev, vtime); vtime->state = VTIME_INACTIVE; @@ -926,6 +1109,7 @@ static int kcpustat_field_vtime(u64 *cpustat, int cpu, u64 *val) { struct vtime *vtime = &tsk->vtime; + struct rq *rq = cpu_rq(cpu); unsigned int seq; do { @@ -967,6 +1151,14 @@ static int kcpustat_field_vtime(u64 *cpustat, if (state == VTIME_GUEST && task_nice(tsk) > 0) *val += vtime->gtime + vtime_delta(vtime); break; + case CPUTIME_IDLE: + if (state == VTIME_IDLE && !atomic_read(&rq->nr_iowait)) + *val += vtime_delta(vtime); + break; + case CPUTIME_IOWAIT: + if (state == VTIME_IDLE && atomic_read(&rq->nr_iowait) > 0) + *val += vtime_delta(vtime); + break; default: break; } @@ -975,16 +1167,15 @@ static int kcpustat_field_vtime(u64 *cpustat, return 0; } -u64 kcpustat_field(struct kernel_cpustat *kcpustat, - enum cpu_usage_stat usage, int cpu) +u64 kcpustat_field(enum cpu_usage_stat usage, int cpu) { - u64 *cpustat = kcpustat->cpustat; + u64 *cpustat = kcpustat_cpu(cpu).cpustat; u64 val = cpustat[usage]; struct rq *rq; int err; - if (!vtime_accounting_enabled_cpu(cpu)) - return val; + if (!vtime_generic_enabled_cpu(cpu)) + return kcpustat_field_default(usage, cpu); rq = cpu_rq(cpu); @@ -1030,8 +1221,8 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, *dst = *src; cpustat = dst->cpustat; - /* Task is sleeping, dead or idle, 
nothing to add */ - if (state < VTIME_SYS) + /* Task is sleeping or dead, nothing to add */ + if (state < VTIME_IDLE) continue; delta = vtime_delta(vtime); @@ -1040,15 +1231,17 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, * Task runs either in user (including guest) or kernel space, * add pending nohz time to the right place. */ - if (state == VTIME_SYS) { + switch (state) { + case VTIME_SYS: cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; - } else if (state == VTIME_USER) { + break; + case VTIME_USER: if (task_nice(tsk) > 0) cpustat[CPUTIME_NICE] += vtime->utime + delta; else cpustat[CPUTIME_USER] += vtime->utime + delta; - } else { - WARN_ON_ONCE(state != VTIME_GUEST); + break; + case VTIME_GUEST: if (task_nice(tsk) > 0) { cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; cpustat[CPUTIME_NICE] += vtime->gtime + delta; @@ -1056,6 +1249,15 @@ static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, cpustat[CPUTIME_GUEST] += vtime->gtime + delta; cpustat[CPUTIME_USER] += vtime->gtime + delta; } + break; + case VTIME_IDLE: + if (atomic_read(&cpu_rq(cpu)->nr_iowait) > 0) + cpustat[CPUTIME_IOWAIT] += delta; + else + cpustat[CPUTIME_IDLE] += delta; + break; + default: + WARN_ON_ONCE(1); } } while (read_seqcount_retry(&vtime->seqcount, seq)); @@ -1068,8 +1270,8 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) struct rq *rq; int err; - if (!vtime_accounting_enabled_cpu(cpu)) { - *dst = *src; + if (!vtime_generic_enabled_cpu(cpu)) { + kcpustat_cpu_fetch_default(dst, cpu); return; } @@ -1082,7 +1284,7 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) curr = rcu_dereference(rq->curr); if (WARN_ON_ONCE(!curr)) { rcu_read_unlock(); - *dst = *src; + kcpustat_cpu_fetch_default(dst, cpu); return; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index edca7849b165..0f858b98c9aa 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1515,8 +1515,12 @@ throttle: if 
(unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { if (dl_server(dl_se)) { - replenish_dl_new_period(dl_se, rq); - start_dl_timer(dl_se); + if (dl_se->dl_defer) { + replenish_dl_new_period(dl_se, rq); + start_dl_timer(dl_se); + } else { + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + } } else { enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); } @@ -1793,7 +1797,8 @@ void dl_server_start(struct sched_dl_entity *dl_se) struct rq *rq = dl_se->rq; dl_se->dl_defer_idle = 0; - if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime) + if (!dl_server(dl_se) || dl_se->dl_server_active || !dl_se->dl_runtime || + !dl_se->dl_bw_attached) return; /* @@ -1868,6 +1873,13 @@ void sched_init_dl_servers(void) dl_se->dl_server = 1; dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); + + /* + * No BPF scheduler is loaded at boot, so the ext_server has no + * tasks to protect. Detach its bandwidth reservation, it will + * be attached when a BPF scheduler is loaded. + */ + dl_server_detach_bw(dl_se); #endif } } @@ -1878,6 +1890,9 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) int cpu = cpu_of(rq); struct dl_bw *dl_b; + if (!dl_se->dl_bw_attached) + return; + dl_b = dl_bw_of(cpu_of(rq)); guard(raw_spinlock)(&dl_b->lock); @@ -1889,7 +1904,8 @@ void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) { - u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 old_bw = (init || !dl_se->dl_bw_attached) ? 
0 : + to_ratio(dl_se->dl_period, dl_se->dl_runtime); u64 new_bw = to_ratio(period, runtime); struct rq *rq = dl_se->rq; int cpu = cpu_of(rq); @@ -1909,7 +1925,8 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio if (init) { __add_rq_bw(new_bw, &rq->dl); __dl_add(dl_b, new_bw, cpus); - } else { + dl_se->dl_bw_attached = 1; + } else if (dl_se->dl_bw_attached) { __dl_sub(dl_b, dl_se->dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); @@ -1930,6 +1947,181 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio } /* + * Add @dl_se's bw to the root-domain accounting. + * + * Return -EBUSY if attaching would overflow root domain capacity. + */ +static int __dl_server_attach_bw_locked(struct sched_dl_entity *dl_se, + struct dl_bw *dl_b, int cpus) +{ + struct rq *rq = dl_se->rq; + unsigned long cap; + + /* + * Always update @rq->dl.this_bw, but only update @dl_b->total_bw + * (and run the overflow check it gates) while this CPU is active. + * + * This mirrors dl_server_add_bw() during root-domain rebuilds, which + * only publishes bandwidth from active CPUs into @dl_b. + */ + if (cpu_active(cpu_of(rq))) { + cap = dl_bw_capacity(cpu_of(rq)); + if (__dl_overflow(dl_b, cap, 0, dl_se->dl_bw)) + return -EBUSY; + __dl_add(dl_b, dl_se->dl_bw, cpus); + } + __add_rq_bw(dl_se->dl_bw, &rq->dl); + dl_se->dl_bw_attached = 1; + + return 0; +} + +/* + * Drain @dl_se and remove its bw from the root-domain accounting. + */ +static void __dl_server_detach_bw_locked(struct sched_dl_entity *dl_se, + struct dl_bw *dl_b, int cpus) +{ + struct rq *rq = dl_se->rq; + + /* + * If the server is still active (on_rq), dequeue it via + * dl_server_stop(); task_non_contending() will either subtract + * @dl_bw from running_bw immediately (0-lag passed) or set + * dl_non_contending and arm the inactive_timer. 
+ */ + if (dl_se->dl_server_active) + dl_server_stop(dl_se); + + /* + * Drop @dl_se's contribution from this rq's bandwidth accounting, + * mirroring the __add_rq_bw() done at attach time. + */ + dl_rq_change_utilization(rq, dl_se, 0); + + /* + * Update @dl_b only while this CPU is active, matching + * dl_server_add_bw() during root-domain rebuilds. + * + * If this CPU is inactive, its bandwidth is not currently accounted in + * @dl_b->total_bw: either attach skipped adding it, or a rebuild + * already dropped it while re-publishing active CPUs only. + * + * In that case there is nothing to subtract from @dl_b. Just clear + * @dl_se->dl_bw_attached; if the CPU becomes active again, the next + * rebuild will re-publish its bandwidth. + */ + if (cpu_active(cpu_of(rq))) + __dl_sub(dl_b, dl_se->dl_bw, cpus); + dl_se->dl_bw_attached = 0; +} + +/* + * Attach @dl_se's bandwidth to the root domain's total_bw accounting. + * + * Use to dynamically register a dl_server's bandwidth reservation while + * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is + * already attached. + * + * Returns -EBUSY if attaching would overflow the root domain capacity. + */ +int dl_server_attach_bw(struct sched_dl_entity *dl_se) +{ + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + int cpus, ret; + + if (dl_se->dl_bw_attached) + return 0; + + scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) { + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); + ret = __dl_server_attach_bw_locked(dl_se, dl_b, cpus); + } + if (ret) + return ret; + + /* + * The natural 0->nr_running transition that triggers dl_server_start() + * may have happened while @dl_se was still detached (e.g., between + * scx_bypass(false) and the scx_enable() re-balance loop), so kick a + * start here. + * + * dl_server_start() bails out cleanly if there's nothing to schedule or + * it's already active. 
Skip if @cpu is offline; the server will be + * started naturally on the first enqueue once @cpu comes back. + */ + if (cpu_online(cpu)) + dl_server_start(dl_se); + + return 0; +} + +/* + * Detach @dl_se's bandwidth from the root domain's total_bw accounting. + * + * Use to dynamically unregister a dl_server's bandwidth reservation while + * preserving its configured @dl_runtime / @dl_period. No-op if @dl_se is + * not currently attached. + */ +void dl_server_detach_bw(struct sched_dl_entity *dl_se) +{ + int cpu = cpu_of(dl_se->rq); + struct dl_bw *dl_b; + int cpus; + + if (!dl_se->dl_bw_attached) + return; + + dl_b = dl_bw_of(cpu); + guard(raw_spinlock)(&dl_b->lock); + cpus = dl_bw_cpus(cpu); + __dl_server_detach_bw_locked(dl_se, dl_b, cpus); +} + +/* + * Atomically detach @detach_se and attach @attach_se on the same rq, holding + * @dl_b->lock across both operations so a concurrent sched_setattr() cannot + * steal the bandwidth freed by the detach before the attach can claim it. + * + * Both entities must live on the same rq (same root domain). Returns the + * result of the attach: -EBUSY if attaching @attach_se would overflow root + * domain capacity (in which case both servers end up detached). 
+ */ +int dl_server_swap_bw(struct sched_dl_entity *detach_se, + struct sched_dl_entity *attach_se) +{ + struct rq *rq = detach_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + int cpus, ret; + + WARN_ON_ONCE(attach_se->rq != rq); + + scoped_guard (raw_spinlock, &dl_bw_of(cpu)->lock) { + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); + + if (detach_se->dl_bw_attached) + __dl_server_detach_bw_locked(detach_se, dl_b, cpus); + + if (attach_se->dl_bw_attached) + ret = 0; + else + ret = __dl_server_attach_bw_locked(attach_se, dl_b, cpus); + } + if (ret) + return ret; + + if (cpu_online(cpu)) + dl_server_start(attach_se); + + return 0; +} + +/* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). */ @@ -2292,7 +2484,10 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - if (is_dl_boosted(&p->dl)) { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; + + if (is_dl_boosted(dl_se)) { /* * Because of delays in the detection of the overrun of a * thread's runtime, it might be the case that a thread @@ -2305,14 +2500,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * * In this case, the boost overrides the throttle. */ - if (p->dl.dl_throttled) { + if (dl_se->dl_throttled) { /* * The replenish timer needs to be canceled. No * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). */ - cancel_replenish_timer(&p->dl); - p->dl.dl_throttled = 0; + cancel_replenish_timer(dl_se); + dl_se->dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { /* @@ -2324,7 +2519,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * being boosted again with no means to replenish the runtime and clear * the throttle. 
*/ - p->dl.dl_throttled = 0; + dl_se->dl_throttled = 0; if (!(flags & ENQUEUE_REPLENISH)) printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n", task_pid_nr(p)); @@ -2333,20 +2528,23 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) } check_schedstat_required(); - update_stats_wait_start_dl(dl_rq_of_se(&p->dl), &p->dl); + update_stats_wait_start_dl(dl_rq, dl_se); - if (p->on_rq == TASK_ON_RQ_MIGRATING) + if (task_on_rq_migrating(p)) flags |= ENQUEUE_MIGRATING; - enqueue_dl_entity(&p->dl, flags); + enqueue_dl_entity(dl_se, flags); - if (dl_server(&p->dl)) + if (dl_server(dl_se)) return; if (task_is_blocked(p)) return; - if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) + if (dl_rq->curr == dl_se) + return; + + if (!task_current(rq, p) && !dl_se->dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2354,7 +2552,7 @@ static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); - if (p->on_rq == TASK_ON_RQ_MIGRATING) + if (task_on_rq_migrating(p)) flags |= DEQUEUE_MIGRATING; dequeue_dl_entity(&p->dl, flags); @@ -2506,8 +2704,14 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_curr(rq); } -static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static int balance_dl(struct rq *rq, struct rq_flags *rf) { + /* + * Note, rq->donor may change during rq lock drops, + * so don't re-use prev across lock drops + */ + struct task_struct *p = rq->donor; + if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { /* * This is OK, because current is on_cpu, which avoids it being @@ -2562,6 +2766,10 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) } #endif /* !CONFIG_SCHED_HRTICK */ +/* + * DL keeps current in tree, because ->deadline is not typically changed while + * a task is runnable. 
+ */ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { struct sched_dl_entity *dl_se = &p->dl; @@ -2574,6 +2782,9 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); + WARN_ON_ONCE(dl_rq->curr); + dl_rq->curr = dl_se; + if (!first) return; @@ -2637,17 +2848,20 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s struct sched_dl_entity *dl_se = &p->dl; struct dl_rq *dl_rq = &rq->dl; - if (on_dl_rq(&p->dl)) + if (on_dl_rq(dl_se)) update_stats_wait_start_dl(dl_rq, dl_se); update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); + WARN_ON_ONCE(dl_rq->curr != dl_se); + dl_rq->curr = NULL; + if (task_is_blocked(p)) return; - if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + if (on_dl_rq(dl_se) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -3107,20 +3321,18 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_dl(struct task_struct *p, struct affinity_context *ctx) { - struct root_domain *src_rd; struct rq *rq; WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); - src_rd = rq->rd; /* * Migrating a SCHED_DEADLINE task between exclusive * cpusets (different root_domains) entails a bandwidth * update. We already made space for us in the destination * domain (see cpuset_can_attach()). 
*/ - if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { + if (dl_task_needs_bw_move(p, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -3137,6 +3349,15 @@ static void set_cpus_allowed_dl(struct task_struct *p, set_cpus_allowed_common(p, ctx); } +bool dl_task_needs_bw_move(struct task_struct *p, + const struct cpumask *new_mask) +{ + if (!dl_task(p)) + return false; + + return !cpumask_intersects(task_rq(p)->rd->span, new_mask); +} + /* Assumes rq->lock is held */ static void rq_online_dl(struct rq *rq) { @@ -3229,12 +3450,12 @@ static void dl_server_add_bw(struct root_domain *rd, int cpu) struct sched_dl_entity *dl_se; dl_se = &cpu_rq(cpu)->fair_server; - if (dl_server(dl_se) && cpu_active(cpu)) + if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu)) __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); #ifdef CONFIG_SCHED_CLASS_EXT dl_se = &cpu_rq(cpu)->ext_server; - if (dl_server(dl_se) && cpu_active(cpu)) + if (dl_server(dl_se) && dl_se->dl_bw_attached && cpu_active(cpu)) __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(cpu)); #endif } @@ -3243,11 +3464,13 @@ static u64 dl_server_read_bw(int cpu) { u64 dl_bw = 0; - if (cpu_rq(cpu)->fair_server.dl_server) + if (cpu_rq(cpu)->fair_server.dl_server && + cpu_rq(cpu)->fair_server.dl_bw_attached) dl_bw += cpu_rq(cpu)->fair_server.dl_bw; #ifdef CONFIG_SCHED_CLASS_EXT - if (cpu_rq(cpu)->ext_server.dl_server) + if (cpu_rq(cpu)->ext_server.dl_server && + cpu_rq(cpu)->ext_server.dl_bw_attached) dl_bw += cpu_rq(cpu)->ext_server.dl_bw; #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 74c1617cf652..40584b27ea0c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -136,7 +136,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, if (cnt > 63) cnt = 63; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -210,6 +210,48 @@ static const struct file_operations sched_scaling_fops = { 
.release = single_release, }; +#ifdef CONFIG_SCHED_CACHE +static ssize_t +sched_cache_enable_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + bool val; + int ret; + + ret = kstrtobool_from_user(ubuf, cnt, &val); + if (ret) + return ret; + + sysctl_sched_cache_user = val; + + sched_cache_active_set(); + + *ppos += cnt; + + return cnt; +} + +static int sched_cache_enable_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", sysctl_sched_cache_user); + return 0; +} + +static int sched_cache_enable_open(struct inode *inode, + struct file *filp) +{ + return single_open(filp, sched_cache_enable_show, NULL); +} + +static const struct file_operations sched_cache_enable_fops = { + .open = sched_cache_enable_open, + .write = sched_cache_enable_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + #ifdef CONFIG_PREEMPT_DYNAMIC static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, @@ -221,7 +263,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, if (cnt > 15) cnt = 15; - if (copy_from_user(&buf, ubuf, cnt)) + if (copy_from_user(buf, ubuf, cnt)) return -EFAULT; buf[cnt] = 0; @@ -239,6 +281,7 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { int i = (IS_ENABLED(CONFIG_PREEMPT_RT) || IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY)) * 2; + int mode = READ_ONCE(preempt_dynamic_mode); int j; /* Count entries in NULL terminated preempt_modes */ @@ -247,10 +290,10 @@ static int sched_dynamic_show(struct seq_file *m, void *v) j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); for (; i < j; i++) { - if (preempt_dynamic_mode == i) + if (mode == i) seq_puts(m, "("); seq_puts(m, preempt_modes[i]); - if (preempt_dynamic_mode == i) + if (mode == i) seq_puts(m, ")"); seq_puts(m, " "); @@ -373,6 +416,9 @@ static ssize_t sched_server_write_common(struct file *filp, const char 
__user *u return -EINVAL; } + if (!cpu_online(cpu_of(rq))) + return -EBUSY; + update_rq_clock(rq); dl_server_stop(dl_se); retval = dl_server_apply_params(dl_se, runtime, period, 0); @@ -445,6 +491,8 @@ static const struct file_operations fair_server_runtime_fops = { .release = single_release, }; +static struct dentry *debugfs_sched; + #ifdef CONFIG_SCHED_CLASS_EXT static ssize_t sched_ext_server_runtime_write(struct file *filp, const char __user *ubuf, @@ -477,75 +525,92 @@ static const struct file_operations ext_server_runtime_fops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SCHED_CLASS_EXT */ static ssize_t -sched_fair_server_period_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +sched_ext_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; struct rq *rq = cpu_rq(cpu); return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, - &rq->fair_server); + &rq->ext_server); } -static int sched_fair_server_period_show(struct seq_file *m, void *v) +static int sched_ext_server_period_show(struct seq_file *m, void *v) { unsigned long cpu = (unsigned long) m->private; struct rq *rq = cpu_rq(cpu); - return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server); + return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server); } -static int sched_fair_server_period_open(struct inode *inode, struct file *filp) +static int sched_ext_server_period_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_fair_server_period_show, inode->i_private); + return single_open(filp, sched_ext_server_period_show, inode->i_private); } -static const struct file_operations fair_server_period_fops = { - .open = sched_fair_server_period_open, - .write = sched_fair_server_period_write, +static const struct file_operations ext_server_period_fops = { + .open = sched_ext_server_period_open, + 
.write = sched_ext_server_period_write, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; -#ifdef CONFIG_SCHED_CLASS_EXT +static void debugfs_ext_server_init(void) +{ + struct dentry *d_ext; + unsigned long cpu; + + d_ext = debugfs_create_dir("ext_server", debugfs_sched); + if (!d_ext) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_ext); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops); + } +} +#endif /* CONFIG_SCHED_CLASS_EXT */ + static ssize_t -sched_ext_server_period_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +sched_fair_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { long cpu = (long) ((struct seq_file *) filp->private_data)->private; struct rq *rq = cpu_rq(cpu); return sched_server_write_common(filp, ubuf, cnt, ppos, DL_PERIOD, - &rq->ext_server); + &rq->fair_server); } -static int sched_ext_server_period_show(struct seq_file *m, void *v) +static int sched_fair_server_period_show(struct seq_file *m, void *v) { unsigned long cpu = (unsigned long) m->private; struct rq *rq = cpu_rq(cpu); - return sched_server_show_common(m, v, DL_PERIOD, &rq->ext_server); + return sched_server_show_common(m, v, DL_PERIOD, &rq->fair_server); } -static int sched_ext_server_period_open(struct inode *inode, struct file *filp) +static int sched_fair_server_period_open(struct inode *inode, struct file *filp) { - return single_open(filp, sched_ext_server_period_show, inode->i_private); + return single_open(filp, sched_fair_server_period_show, inode->i_private); } -static const struct file_operations ext_server_period_fops = { - .open = sched_ext_server_period_open, - .write = sched_ext_server_period_write, +static const struct file_operations 
fair_server_period_fops = { + .open = sched_fair_server_period_open, + .write = sched_fair_server_period_write, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SCHED_CLASS_EXT */ - -static struct dentry *debugfs_sched; static void debugfs_fair_server_init(void) { @@ -568,32 +633,9 @@ static void debugfs_fair_server_init(void) } } -#ifdef CONFIG_SCHED_CLASS_EXT -static void debugfs_ext_server_init(void) -{ - struct dentry *d_ext; - unsigned long cpu; - - d_ext = debugfs_create_dir("ext_server", debugfs_sched); - if (!d_ext) - return; - - for_each_possible_cpu(cpu) { - struct dentry *d_cpu; - char buf[32]; - - snprintf(buf, sizeof(buf), "cpu%lu", cpu); - d_cpu = debugfs_create_dir(buf, d_ext); - - debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &ext_server_runtime_fops); - debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &ext_server_period_fops); - } -} -#endif /* CONFIG_SCHED_CLASS_EXT */ - static __init int sched_init_debug(void) { - struct dentry __maybe_unused *numa; + struct dentry __maybe_unused *numa, *llc; debugfs_sched = debugfs_create_dir("sched", NULL); @@ -626,6 +668,22 @@ static __init int sched_init_debug(void) debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_SCHED_CACHE + llc = debugfs_create_dir("llc_balancing", debugfs_sched); + debugfs_create_file("enabled", 0644, llc, NULL, + &sched_cache_enable_fops); + debugfs_create_u32("aggr_tolerance", 0644, llc, + &llc_aggr_tolerance); + debugfs_create_u32("epoch_period", 0644, llc, + &llc_epoch_period); + debugfs_create_u32("epoch_affinity_timeout", 0644, llc, + &llc_epoch_affinity_timeout); + debugfs_create_u32("overaggr_pct", 0644, llc, + &llc_overaggr_pct); + debugfs_create_u32("imb_pct", 0644, llc, + &llc_imb_pct); +#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); debugfs_fair_server_init(); @@ -750,7 +808,7 @@ void 
dirty_sched_domain_sysctl(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { - struct sched_entity *se = tg->se[cpu]; + struct sched_entity *se = tg_se(tg, cpu); #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) #define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", \ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 345aa11b84b2..0db6fa2daea3 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6,8 +6,6 @@ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> * Copyright (c) 2022 David Vernet <dvernet@meta.com> */ -#include <linux/btf_ids.h> -#include "ext_idle.h" static DEFINE_RAW_SPINLOCK(scx_sched_lock); @@ -38,6 +36,15 @@ static const struct rhashtable_params scx_sched_hash_params = { static struct rhashtable scx_sched_hash; #endif +/* see SCX_OPS_TID_TO_TASK */ +static const struct rhashtable_params scx_tid_hash_params = { + .key_len = sizeof_field(struct sched_ext_entity, tid), + .key_offset = offsetof(struct sched_ext_entity, tid), + .head_offset = offsetof(struct sched_ext_entity, tid_hash_node), + .insecure_elasticity = true, /* inserted/removed under scx_tasks_lock */ +}; +static struct rhashtable scx_tid_hash; + /* * During exit, a task may schedule after losing its PIDs. When disabling the * BPF scheduler, we need to be able to iterate tasks in every state to @@ -56,10 +63,25 @@ static DEFINE_RAW_SPINLOCK(scx_bypass_lock); static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); +static DEFINE_STATIC_KEY_FALSE(__scx_tid_to_task_enabled); + +/* + * True once SCX_OPS_TID_TO_TASK has been negotiated with the root scheduler + * and the tid->task table is live. Wraps the static key so callers don't + * take the address, and hints "likely enabled" for the common case where + * the feature is in use. 
+ */ +static inline bool scx_tid_to_task_enabled(void) +{ + return static_branch_likely(&__scx_tid_to_task_enabled); +} static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); +/* Global cursor for the per-CPU tid allocator. Starts at 1; tid 0 is reserved. */ +static atomic64_t scx_tid_cursor = ATOMIC64_INIT(1); + #ifdef CONFIG_EXT_SUB_SCHED /* * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit @@ -109,6 +131,17 @@ struct scx_kick_syncs { static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); /* + * Per-CPU buffered allocator state for p->scx.tid. Each CPU pulls a chunk of + * SCX_TID_CHUNK ids from scx_tid_cursor and hands them out locally without + * further synchronization. See scx_alloc_tid(). + */ +struct scx_tid_alloc { + u64 next; + u64 end; +}; +static DEFINE_PER_CPU(struct scx_tid_alloc, scx_tid_alloc); + +/* * Direct dispatch marker. * * Non-NULL values are used for direct dispatch from enqueue path. A valid @@ -198,26 +231,21 @@ static void run_deferred(struct rq *rq); static bool task_dead_and_done(struct task_struct *p); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); -static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, - s64 exit_code, const char *fmt, va_list args); -static __printf(4, 5) bool scx_exit(struct scx_sched *sch, - enum scx_exit_kind kind, s64 exit_code, - const char *fmt, ...) +__printf(5, 6) bool __scx_exit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, + s32 exit_cpu, const char *fmt, ...) { va_list args; bool ret; va_start(args, fmt); - ret = scx_vexit(sch, kind, exit_code, fmt, args); + ret = scx_vexit(sch, kind, exit_code, exit_cpu, fmt, args); va_end(args); return ret; } -#define scx_error(sch, fmt, args...) 
scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) -#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) - #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) static long jiffies_delta_msecs(unsigned long at, unsigned long now) @@ -295,10 +323,9 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) rcu_assign_pointer(p->scx.sched, sch); } #else /* CONFIG_EXT_SUB_SCHED */ -static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } -static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } -static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; } -static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} +static inline struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } +static inline struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } +static inline void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} #endif /* CONFIG_EXT_SUB_SCHED */ /** @@ -485,6 +512,33 @@ do { \ update_locked_rq(__prev_locked_rq); \ } while (0) +/* + * Flipped on enable per sch->is_cid_type. Declared in ext_internal.h so + * subsystem inlines can read it. + */ +DEFINE_STATIC_KEY_FALSE(__scx_is_cid_type); + +/* + * scx_cpu_arg() wraps a cpu arg being handed to an SCX op. For cid-form + * schedulers it resolves to the matching cid; for cpu-form it passes @cpu + * through. scx_cpu_ret() is the inverse for a cpu/cid returned from an op + * (currently only ops.select_cpu); it validates the BPF-supplied cid and + * triggers scx_error() on @sch if invalid. 
+ */ +static s32 scx_cpu_arg(s32 cpu) +{ + if (scx_is_cid_type()) + return __scx_cpu_to_cid(cpu); + return cpu; +} + +static s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid) +{ + if (cpu_or_cid < 0 || !scx_is_cid_type()) + return cpu_or_cid; + return scx_cid_to_cpu(sch, cpu_or_cid); +} + #define SCX_CALL_OP_RET(sch, op, locked_rq, args...) \ ({ \ struct rq *__prev_locked_rq; \ @@ -546,6 +600,44 @@ do { \ __ret; \ }) +/** + * scx_call_op_set_cpumask - invoke ops.set_cpumask / ops_cid.set_cmask for @task + * @sch: scx_sched being invoked + * @rq: rq to update as the currently-locked rq, or NULL + * @task: task whose affinity is changing + * @cpumask: new cpumask + * + * For cid-form schedulers, translate @cpumask to a cmask via the per-cpu + * scratch in ext_cid.c and dispatch through the ops_cid union view. Caller + * must hold @rq's rq lock so this_cpu_ptr is stable across the call. + */ +static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq, + struct task_struct *task, + const struct cpumask *cpumask) +{ + WARN_ON_ONCE(current->scx.kf_tasks[0]); + current->scx.kf_tasks[0] = task; + if (rq) + update_locked_rq(rq); + + if (scx_is_cid_type()) { + struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch); + /* + * Build the per-CPU arena cmask and hand BPF its arena address. + * Caller holds the rq lock with IRQs disabled, which makes us + * the sole user of the scratch area. 
+ */ + scx_cpumask_to_cmask(cpumask, kern_va); + sch->ops_cid.set_cmask(task, scx_kaddr_to_arena(sch, kern_va)); + } else { + sch->ops.set_cpumask(task, cpumask); + } + + if (rq) + update_locked_rq(NULL); + current->scx.kf_tasks[0] = NULL; +} + /* see SCX_CALL_OP_TASK() */ static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch, struct task_struct *p) @@ -712,6 +804,51 @@ struct bpf_iter_scx_dsq { } __attribute__((aligned(8))); +static u32 scx_get_task_state(const struct task_struct *p) +{ + return p->scx.flags & SCX_TASK_STATE_MASK; +} + +static void scx_set_task_state(struct task_struct *p, u32 state) +{ + u32 prev_state = scx_get_task_state(p); + bool warn = false; + + switch (state) { + case SCX_TASK_NONE: + warn = prev_state == SCX_TASK_DEAD; + break; + case SCX_TASK_INIT_BEGIN: + warn = prev_state != SCX_TASK_NONE; + break; + case SCX_TASK_INIT: + warn = prev_state != SCX_TASK_INIT_BEGIN; + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; + break; + case SCX_TASK_READY: + warn = !(prev_state == SCX_TASK_INIT || + prev_state == SCX_TASK_ENABLED); + break; + case SCX_TASK_ENABLED: + warn = prev_state != SCX_TASK_READY; + break; + case SCX_TASK_DEAD: + warn = !(prev_state == SCX_TASK_NONE || + prev_state == SCX_TASK_INIT_BEGIN); + break; + default: + WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); + return; + } + + WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", + prev_state, state, p->comm, p->pid); + + p->scx.flags &= ~SCX_TASK_STATE_MASK; + p->scx.flags |= state; +} + /* * SCX task iterator. 
*/ @@ -766,7 +903,8 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); iter->cgrp = cgrp; iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); return; } #endif @@ -813,6 +951,24 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) } /** + * scx_task_iter_relock - Re-acquire scx_tasks_lock and, optionally, @p's rq + * @iter: iterator to relock + * @p: task whose rq to lock, or %NULL for scx_tasks_lock only + * + * Counterpart to scx_task_iter_unlock(). Locking @p's rq is optional. Once + * re-acquired, both locks are managed by the iterator from here on. + */ +static void scx_task_iter_relock(struct scx_task_iter *iter, + struct task_struct *p) +{ + __scx_task_iter_maybe_relock(iter); + if (p) { + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked_task = p; + } +} + +/** * scx_task_iter_stop - Stop a task iteration and unlock scx_tasks_lock * @iter: iterator to exit * @@ -866,7 +1022,8 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) iter->css_pos = css_next_descendant_pre(iter->css_pos, &iter->cgrp->self); if (iter->css_pos) - css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + css_task_iter_start(iter->css_pos, CSS_TASK_ITER_WITH_DEAD, + &iter->css_iter); } return NULL; } @@ -926,16 +1083,27 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * * Test for idle_sched_class as only init_tasks are on it. 
*/ - if (p->sched_class != &idle_sched_class) - break; - } - if (!p) - return NULL; + if (p->sched_class == &idle_sched_class) + continue; - iter->rq = task_rq_lock(p, &iter->rf); - iter->locked_task = p; + iter->rq = task_rq_lock(p, &iter->rf); + iter->locked_task = p; - return p; + /* + * cgroup_task_dead() removes the dead tasks from cset->tasks + * after sched_ext_dead() and cgroup iteration may see tasks + * which already finished sched_ext_dead(). %SCX_TASK_DEAD is + * set by sched_ext_dead() under @p's rq lock. Test it to + * avoid visiting tasks which are already dead from SCX POV. + */ + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + __scx_task_iter_rq_unlock(iter); + continue; + } + + return p; + } + return NULL; } /** @@ -1029,7 +1197,7 @@ static inline bool __cpu_valid(s32 cpu) } /** - * ops_cpu_valid - Verify a cpu number, to be used on ops input args + * scx_cpu_valid - Verify a cpu number, to be used on ops input args * @sch: scx_sched to abort on error * @cpu: cpu number which came from a BPF ops * @where: extra information reported on error @@ -1038,7 +1206,7 @@ static inline bool __cpu_valid(s32 cpu) * Verify that it is in range and one of the possible cpus. If invalid, trigger * an ops error. 
*/ -static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) +bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) { if (__cpu_valid(cpu)) { return true; @@ -1685,9 +1853,9 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, return &rq->scx.local_dsq; if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { - s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK); - if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) + if (!scx_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) return find_global_dsq(sch, tcpu); return &cpu_rq(cpu)->scx.local_dsq; @@ -2021,6 +2189,7 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) /* dequeue is always temporary, don't reset runnable_at */ clr_task_runnable(p, false); +retry: /* acquire ensures that we see the preceding updates on QUEUED */ opss = atomic_long_read_acquire(&p->scx.ops_state); @@ -2034,8 +2203,20 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) */ BUG(); case SCX_OPSS_QUEUED: - /* A queued task must always be in BPF scheduler's custody */ - WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY)); + /* + * A queued task must always be in BPF scheduler's custody. If + * SCX_TASK_IN_CUSTODY is clear, finish_dispatch() on another + * CPU has already passed call_task_dequeue() (which clears the + * flag), but has not yet written SCX_OPSS_NONE. That final + * store does not require this rq's lock, so retrying with + * cpu_relax() is bounded: we will observe NONE (or DISPATCHING, + * handled by the fallthrough) on a subsequent iteration. 
+ */ + if (unlikely(!(READ_ONCE(p->scx.flags) & SCX_TASK_IN_CUSTODY))) { + cpu_relax(); + goto retry; + } + if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) break; @@ -2767,11 +2948,13 @@ scx_dispatch_sched(struct scx_sched *sch, struct rq *rq, dspc->nr_tasks = 0; if (nested) { - SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL); + SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu), + prev_on_sch ? prev : NULL); } else { /* stash @prev so that nested invocations can access it */ rq->scx.sub_dispatch_prev = prev; - SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL); + SCX_CALL_OP(sch, dispatch, rq, scx_cpu_arg(cpu), + prev_on_sch ? prev : NULL); rq->scx.sub_dispatch_prev = NULL; } @@ -2829,7 +3012,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * core. This callback complements ->cpu_release(), which is * emitted in switch_class(). */ - if (SCX_HAS_OP(sch, cpu_acquire)) + if (sch->ops.cpu_acquire) SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL); rq->scx.cpu_released = false; } @@ -2975,7 +3158,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) * next time that balance_one() is invoked. 
*/ if (!rq->scx.cpu_released) { - if (SCX_HAS_OP(sch, cpu_release)) { + if (sch->ops.cpu_release) { struct scx_cpu_release_args args = { .reason = preempt_reason_from_class(next_class), .task = next, @@ -3266,11 +3449,13 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag *ddsp_taskp = p; this_rq()->scx.in_select_cpu = true; - cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags); + cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, + scx_cpu_arg(prev_cpu), wake_flags); + cpu = scx_cpu_ret(sch, cpu); this_rq()->scx.in_select_cpu = false; p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; - if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) + if (scx_cpu_valid(sch, cpu, "from ops.select_cpu()")) return cpu; else return prev_cpu; @@ -3316,7 +3501,7 @@ static void set_cpus_allowed_scx(struct task_struct *p, * designation pointless. Cast it away when calling the operation. */ if (SCX_HAS_OP(sch, set_cpumask)) - SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr); + scx_call_op_set_cpumask(sch, task_rq(p), p, (struct cpumask *)p->cpus_ptr); } static void handle_hotplug(struct rq *rq, bool online) @@ -3338,9 +3523,9 @@ static void handle_hotplug(struct rq *rq, bool online) scx_idle_update_selcpu_topology(&sch->ops); if (online && SCX_HAS_OP(sch, cpu_online)) - SCX_CALL_OP(sch, cpu_online, NULL, cpu); + SCX_CALL_OP(sch, cpu_online, NULL, scx_cpu_arg(cpu)); else if (!online && SCX_HAS_OP(sch, cpu_offline)) - SCX_CALL_OP(sch, cpu_offline, NULL, cpu); + SCX_CALL_OP(sch, cpu_offline, NULL, scx_cpu_arg(cpu)); else scx_exit(sch, SCX_EXIT_UNREG_KERN, SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, @@ -3388,9 +3573,10 @@ static bool check_rq_for_timeouts(struct rq *rq) last_runnable + READ_ONCE(sch->watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); - scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, - "%s[%d] failed to run for %u.%03us", - p->comm, p->pid, dur_ms / 1000, dur_ms % 
1000); + __scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, cpu_of(rq), + "%s[%d] failed to run for %u.%03us", + p->comm, p->pid, dur_ms / 1000, + dur_ms % 1000); timed_out = true; break; } @@ -3487,41 +3673,6 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ -static u32 scx_get_task_state(const struct task_struct *p) -{ - return p->scx.flags & SCX_TASK_STATE_MASK; -} - -static void scx_set_task_state(struct task_struct *p, u32 state) -{ - u32 prev_state = scx_get_task_state(p); - bool warn = false; - - switch (state) { - case SCX_TASK_NONE: - break; - case SCX_TASK_INIT: - warn = prev_state != SCX_TASK_NONE; - break; - case SCX_TASK_READY: - warn = prev_state == SCX_TASK_NONE; - break; - case SCX_TASK_ENABLED: - warn = prev_state != SCX_TASK_READY; - break; - default: - WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", - prev_state, state, p->comm, p->pid); - return; - } - - WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", - prev_state, state, p->comm, p->pid); - - p->scx.flags &= ~SCX_TASK_STATE_MASK; - p->scx.flags |= state; -} - static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) { int ret; @@ -3573,22 +3724,6 @@ static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fo return 0; } -static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) -{ - int ret; - - ret = __scx_init_task(sch, p, fork); - if (!ret) { - /* - * While @p's rq is not locked. @p is not visible to the rest of - * SCX yet and it's safe to update the flags and state. 
- */ - p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; - scx_set_task_state(p, SCX_TASK_INIT); - } - return ret; -} - static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) { struct rq *rq = task_rq(p); @@ -3703,7 +3838,8 @@ static void scx_disable_and_exit_task(struct scx_sched *sch, * If set, @p exited between __scx_init_task() and scx_enable_task() in * scx_sub_enable() and is initialized for both the associated sched and * its parent. Exit for the child too - scx_enable_task() never ran for - * it, so undo only init_task. + * it, so undo only init_task. The flag is only set on the sub-enable + * path, so it's always clear when @p arrives here in %SCX_TASK_NONE. */ if (p->scx.flags & SCX_TASK_SUB_INIT) { if (!WARN_ON_ONCE(!scx_enabling_sub_sched)) @@ -3728,6 +3864,33 @@ void init_scx_entity(struct sched_ext_entity *scx) scx->slice = SCX_SLICE_DFL; } +/* See scx_tid_alloc / scx_tid_cursor. */ +static u64 scx_alloc_tid(void) +{ + struct scx_tid_alloc *ta; + + guard(preempt)(); + ta = this_cpu_ptr(&scx_tid_alloc); + + if (unlikely(ta->next >= ta->end)) { + ta->next = atomic64_fetch_add(SCX_TID_CHUNK, &scx_tid_cursor); + ta->end = ta->next + SCX_TID_CHUNK; + } + return ta->next++; +} + +static void scx_tid_hash_insert(struct task_struct *p) +{ + int ret; + + lockdep_assert_held(&scx_tasks_lock); + + ret = rhashtable_lookup_insert_fast(&scx_tid_hash, + &p->scx.tid_hash_node, + scx_tid_hash_params); + WARN_ON_ONCE(ret); +} + void scx_pre_fork(struct task_struct *p) { /* @@ -3745,16 +3908,22 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) percpu_rwsem_assert_held(&scx_fork_rwsem); + p->scx.tid = scx_alloc_tid(); + if (scx_init_task_enabled) { #ifdef CONFIG_EXT_SUB_SCHED struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; #else struct scx_sched *sch = scx_root; #endif - ret = scx_init_task(sch, p, true); - if (!ret) - scx_set_task_sched(p, sch); - return ret; + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); + ret = 
__scx_init_task(sch, p, true); + if (unlikely(ret)) { + scx_set_task_state(p, SCX_TASK_NONE); + return ret; + } + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); } return 0; @@ -3780,9 +3949,11 @@ void scx_post_fork(struct task_struct *p) } } - raw_spin_lock_irq(&scx_tasks_lock); - list_add_tail(&p->scx.tasks_node, &scx_tasks); - raw_spin_unlock_irq(&scx_tasks_lock); + scoped_guard(raw_spinlock_irq, &scx_tasks_lock) { + list_add_tail(&p->scx.tasks_node, &scx_tasks); + if (scx_tid_to_task_enabled()) + scx_tid_hash_insert(p); + } percpu_up_read(&scx_fork_rwsem); } @@ -3833,28 +4004,41 @@ static bool task_dead_and_done(struct task_struct *p) void sched_ext_dead(struct task_struct *p) { - unsigned long flags; - /* * By the time control reaches here, @p has %TASK_DEAD set, switched out * for the last time and then dropped the rq lock - task_dead_and_done() * should be returning %true nullifying the straggling sched_class ops. * Remove from scx_tasks and exit @p. */ - raw_spin_lock_irqsave(&scx_tasks_lock, flags); - list_del_init(&p->scx.tasks_node); - raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); + scoped_guard(raw_spinlock_irqsave, &scx_tasks_lock) { + list_del_init(&p->scx.tasks_node); + if (scx_tid_to_task_enabled()) + rhashtable_remove_fast(&scx_tid_hash, + &p->scx.tid_hash_node, + scx_tid_hash_params); + } /* * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY -> * ENABLED transitions can't race us. Disable ops for @p. + * + * %SCX_TASK_DEAD synchronizes against cgroup task iteration - see + * scx_task_iter_next_locked(). NONE tasks need no marking: cgroup + * iteration is only used from sub-sched paths, which require root + * enabled. Root enable transitions every live task to at least READY. + * + * %INIT_BEGIN means ops.init_task() is running for @p. Don't call + * into ops; transition to %DEAD so the post-init recheck unwinds + * via scx_sub_init_cancel_task(). 
*/ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_disable_and_exit_task(scx_task_sched(p), p); + if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN) + scx_disable_and_exit_task(scx_task_sched(p), p); + scx_set_task_state(p, SCX_TASK_DEAD); task_rq_unlock(rq, p, &rf); } } @@ -3892,7 +4076,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p) * different scheduler class. Keep the BPF scheduler up-to-date. */ if (SCX_HAS_OP(sch, set_cpumask)) - SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr); + scx_call_op_set_cpumask(sch, rq, p, (struct cpumask *)p->cpus_ptr); } static void switched_from_scx(struct rq *rq, struct task_struct *p) @@ -3900,6 +4084,16 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) if (task_dead_and_done(p)) return; + /* + * %NONE means SCX is no longer tracking @p at the task level (e.g. + * scx_fail_parent() handed @p back to the parent at NONE pending the + * parent's own teardown). There is nothing to disable; calling + * scx_disable_task() would WARN on the non-%ENABLED state and trigger a + * NONE -> READY validation failure. + */ + if (scx_get_task_state(p) == SCX_TASK_NONE) + return; + scx_disable_task(scx_task_sched(p), p); } @@ -4357,11 +4551,13 @@ void scx_cgroup_move_task(struct task_struct *p) return; /* - * @p must have ops.cgroup_prep_move() called on it and thus - * cgrp_moving_from set. + * scx_cgroup_can_attach() sets cgrp_moving_from only when the task's + * cgroup changes. Migration keys off css rather than cgroup identity, + * so it can hand an unchanged-cgroup task here with cgrp_moving_from + * NULL. Nothing to report to the BPF scheduler then, so skip it and + * keep prep_move and move paired. 
*/ - if (SCX_HAS_OP(sch, cgroup_move) && - !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) + if (SCX_HAS_OP(sch, cgroup_move) && p->scx.cgrp_moving_from) SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p), p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); @@ -4463,9 +4659,9 @@ static void scx_cgroup_unlock(void) #endif } #else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ -static struct cgroup *root_cgroup(void) { return NULL; } -static void scx_cgroup_lock(void) {} -static void scx_cgroup_unlock(void) {} +static inline struct cgroup *root_cgroup(void) { return NULL; } +static inline void scx_cgroup_lock(void) {} +static inline void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ #ifdef CONFIG_EXT_SUB_SCHED @@ -4484,8 +4680,8 @@ static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) rcu_assign_pointer(pos->scx_sched, sch); } #else /* CONFIG_EXT_SUB_SCHED */ -static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } -static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} +static inline struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static inline void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} #endif /* CONFIG_EXT_SUB_SCHED */ /* @@ -4771,6 +4967,48 @@ static const struct attribute_group scx_global_attr_group = { static void free_pnode(struct scx_sched_pnode *pnode); static void free_exit_info(struct scx_exit_info *ei); +static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch) +{ + size_t size = struct_size_t(struct scx_cmask, bits, + SCX_CMASK_NR_WORDS(num_possible_cpus())); + int cpu; + + if (!sch->is_cid_type || !sch->arena_pool) + return 0; + + sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *); + if (!sch->set_cmask_scratch) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); + + *slot = scx_arena_alloc(sch, size); + if (!*slot) + return -ENOMEM; + 
scx_cmask_init(*slot, 0, num_possible_cpus()); + } + return 0; +} + +static void scx_set_cmask_scratch_free(struct scx_sched *sch) +{ + size_t size = struct_size_t(struct scx_cmask, bits, + SCX_CMASK_NR_WORDS(num_possible_cpus())); + int cpu; + + if (!sch->set_cmask_scratch) + return; + + for_each_possible_cpu(cpu) { + struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu); + + scx_arena_free(sch, *slot, size); + } + free_percpu(sch->set_cmask_scratch); + sch->set_cmask_scratch = NULL; +} + static void scx_sched_free_rcu_work(struct work_struct *work) { struct rcu_work *rcu_work = to_rcu_work(work); @@ -4789,6 +5027,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work) kfree(sch->cgrp_path); if (sch_cgroup(sch)) cgroup_put(sch_cgroup(sch)); + if (sch->sub_kset) + kobject_put(&sch->sub_kset->kobj); #endif /* CONFIG_EXT_SUB_SCHED */ for_each_possible_cpu(cpu) { @@ -4823,6 +5063,10 @@ static void scx_sched_free_rcu_work(struct work_struct *work) rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); free_exit_info(sch->exit_info); + scx_set_cmask_scratch_free(sch); + scx_arena_pool_destroy(sch); + if (sch->arena_map) + bpf_map_put(sch->arena_map); kfree(sch); } @@ -4912,10 +5156,30 @@ static const struct kset_uevent_ops scx_uevent_ops = { */ bool task_should_scx(int policy) { - if (!scx_enabled() || unlikely(scx_enable_state() == SCX_DISABLING)) + /* if disabled, nothing should be on it */ + if (!scx_enabled()) return false; + + /* scx is taking over all SCHED_OTHER and SCHED_EXT tasks */ if (READ_ONCE(scx_switching_all)) return true; + + /* + * scx is tearing down - keep new SCHED_EXT tasks out. + * + * Must come after scx_switching_all test, which serves as a proxy + * for __scx_switched_all. While __scx_switched_all is set, we must + * return true via the branch above: a fork routed to fair would + * stall because next_active_class() skips fair. 
+ * + * This can develop into a deadlock - scx holds scx_enable_mutex across + * kthread_create() in scx_alloc_and_add_sched(); if the new kthread is + * the stalled task, the disable path can never grab the mutex to clear + * scx_switching_all. + */ + if (unlikely(scx_enable_state() == SCX_DISABLING)) + return false; + return policy == SCHED_EXT; } @@ -5494,6 +5758,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) if (!ei) return NULL; + ei->exit_cpu = -1; ei->bt = kzalloc_objs(ei->bt[0], SCX_EXIT_BT_LEN); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); @@ -5566,10 +5831,12 @@ static void refresh_watchdog(void) static s32 scx_link_sched(struct scx_sched *sch) { + const char *err_msg = ""; + s32 ret = 0; + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { #ifdef CONFIG_EXT_SUB_SCHED struct scx_sched *parent = scx_parent(sch); - s32 ret; if (parent) { /* @@ -5579,15 +5846,16 @@ static s32 scx_link_sched(struct scx_sched *sch) * parent can shoot us down. */ if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { - scx_error(sch, "parent disabled"); - return -ENOENT; + err_msg = "parent disabled"; + ret = -ENOENT; + break; } ret = rhashtable_lookup_insert_fast(&scx_sched_hash, &sch->hash_node, scx_sched_hash_params); if (ret) { - scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret); - return ret; + err_msg = "failed to insert into scx_sched_hash"; + break; } list_add_tail(&sch->sibling, &parent->children); @@ -5597,6 +5865,15 @@ static s32 scx_link_sched(struct scx_sched *sch) list_add_tail_rcu(&sch->all, &scx_sched_all); } + /* + * scx_error() takes scx_sched_lock via scx_claim_exit(), so it must run after + * the guard above is released. 
+ */ + if (ret) { + scx_error(sch, "%s (%d)", err_msg, ret); + return ret; + } + refresh_watchdog(); return 0; } @@ -5628,6 +5905,26 @@ static void scx_disable_dump(struct scx_sched *sch) sch->dump_disabled = true; } +static void scx_log_sched_disable(struct scx_sched *sch) +{ + struct scx_exit_info *ei = sch->exit_info; + const char *type = scx_parent(sch) ? "sub-scheduler" : "scheduler"; + + if (ei->kind >= SCX_EXIT_ERROR) { + pr_err("sched_ext: BPF %s \"%s\" disabled (%s)\n", type, + sch->ops.name, ei->reason); + + if (ei->msg[0] != '\0') + pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); +#ifdef CONFIG_STACKTRACE + stack_trace_print(ei->bt, ei->bt_len, 2); +#endif + } else { + pr_info("sched_ext: BPF %s \"%s\" disabled (%s)\n", type, + sch->ops.name, ei->reason); + } +} + #ifdef CONFIG_EXT_SUB_SCHED static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); @@ -5666,7 +5963,7 @@ static void scx_fail_parent(struct scx_sched *sch, scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { scx_disable_and_exit_task(sch, p); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); } } scx_task_iter_stop(&sti); @@ -5714,14 +6011,11 @@ static void scx_sub_disable(struct scx_sched *sch) WARN_ON_ONCE(!scx_task_on_sched(sch, p)); /* - * If $p is about to be freed, nothing prevents $sch from - * unloading before $p reaches sched_ext_free(). Disable and - * exit $p right away. + * @p is pinned by the iter: css_task_iter_next() takes a + * reference and holds it until the next iter_next() call, so + * @p->usage is guaranteed > 0. 
*/ - if (!tryget_task_struct(p)) { - scx_disable_and_exit_task(sch, p); - continue; - } + get_task_struct(p); scx_task_iter_unlock(&sti); @@ -5744,6 +6038,21 @@ static void scx_sub_disable(struct scx_sched *sch) } rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on @sch (the + * sched @p was on at that point), not on $parent. + * $parent's just-completed init is owed an exit_task() + * and we issue it here. + */ + scx_sub_init_cancel_task(parent, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { /* * $p is initialized for $parent and still attached to @@ -5752,13 +6061,14 @@ static void scx_sub_disable(struct scx_sched *sch) * $p having already been initialized, and then enable. */ scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_set_task_state(p, SCX_TASK_INIT); - rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_sched(p, parent); scx_set_task_state(p, SCX_TASK_READY); scx_enable_task(parent, p); } - task_rq_unlock(rq, p, &rf); + task_rq_unlock(rq, p, &rf); put_task_struct(p); } scx_task_iter_stop(&sti); @@ -5798,22 +6108,24 @@ static void scx_sub_disable(struct scx_sched *sch) &sub_detach_args); } + scx_log_sched_disable(sch); + if (sch->ops.exit) SCX_CALL_OP(sch, exit, NULL, sch->exit_info); if (sch->sub_kset) - kset_unregister(sch->sub_kset); + kobject_del(&sch->sub_kset->kobj); kobject_del(&sch->kobj); } #else /* CONFIG_EXT_SUB_SCHED */ -static void drain_descendants(struct scx_sched *sch) { } -static void scx_sub_disable(struct scx_sched *sch) { } +static inline void drain_descendants(struct scx_sched *sch) { } +static inline void scx_sub_disable(struct scx_sched *sch) { } #endif /* CONFIG_EXT_SUB_SCHED */ static void scx_root_disable(struct scx_sched *sch) { - struct scx_exit_info *ei = sch->exit_info; 
struct scx_task_iter sti; struct task_struct *p; + bool was_switched_all; int cpu; /* guarantee forward progress and wait for descendants to be disabled */ @@ -5840,6 +6152,8 @@ static void scx_root_disable(struct scx_sched *sch) */ mutex_lock(&scx_enable_mutex); + was_switched_all = scx_switched_all(); + static_branch_disable(&__scx_switched_all); WRITE_ONCE(scx_switching_all, false); @@ -5889,34 +6203,51 @@ static void scx_root_disable(struct scx_sched *sch) /* * Invalidate all the rq clocks to prevent getting outdated * rq clocks from a previous scx scheduler. + * + * Also re-balance the dl_server bandwidth reservations: detach + * ext_server (no more sched_ext tasks) and reinstate fair_server if it + * was previously detached because we were running in full mode. + * + * Unlike the enable path, this runs on a recovery path that cannot + * fail, so we use dl_server_swap_bw() to atomically free ext_server's + * bandwidth and reclaim it for fair_server under the same dl_b lock. + * + * The swap can still fail with -EBUSY if someone bumped ext_server's + * runtime via debugfs between enable and disable; in that narrow case + * both servers end up detached and we just WARN. 
*/ for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + scx_rq_clock_invalidate(rq); + + scoped_guard(rq_lock_irqsave, rq) { + update_rq_clock(rq); + if (was_switched_all) { + if (WARN_ON_ONCE(dl_server_swap_bw(&rq->ext_server, + &rq->fair_server))) + pr_warn("failed to re-attach fair_server on CPU %d\n", cpu); + } else { + dl_server_detach_bw(&rq->ext_server); + } + } } /* no task is on scx, turn off all the switches and flush in-progress calls */ static_branch_disable(&__scx_enabled); + static_branch_disable(&__scx_is_cid_type); + if (sch->ops.flags & SCX_OPS_TID_TO_TASK) + static_branch_disable(&__scx_tid_to_task_enabled); bitmap_zero(sch->has_op, SCX_OPI_END); scx_idle_disable(); synchronize_rcu(); + if (sch->ops.flags & SCX_OPS_TID_TO_TASK) + rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL); - if (ei->kind >= SCX_EXIT_ERROR) { - pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - sch->ops.name, ei->reason); - - if (ei->msg[0] != '\0') - pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg); -#ifdef CONFIG_STACKTRACE - stack_trace_print(ei->bt, ei->bt_len, 2); -#endif - } else { - pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", - sch->ops.name, ei->reason); - } + scx_log_sched_disable(sch); if (sch->ops.exit) - SCX_CALL_OP(sch, exit, NULL, ei); + SCX_CALL_OP(sch, exit, NULL, sch->exit_info); scx_unlink_sched(sch); @@ -5935,7 +6266,7 @@ static void scx_root_disable(struct scx_sched *sch) */ #ifdef CONFIG_EXT_SUB_SCHED if (sch->sub_kset) - kset_unregister(sch->sub_kset); + kobject_del(&sch->sub_kset->kobj); #endif kobject_del(&sch->kobj); @@ -6214,6 +6545,94 @@ static void scx_dump_task(struct scx_sched *sch, struct seq_buf *s, struct scx_d } } +static void scx_dump_cpu(struct scx_sched *sch, struct seq_buf *s, + struct scx_dump_ctx *dctx, int cpu, + bool dump_all_tasks) +{ + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + struct task_struct *p; + struct seq_buf ns; + size_t avail, used; + char *buf; + bool idle; + + 
rq_lock_irqsave(rq, &rf); + + idle = list_empty(&rq->scx.runnable_list) && + rq->curr->sched_class == &idle_sched_class; + + if (idle && !SCX_HAS_OP(sch, dump_cpu)) + goto next; + + /* + * We don't yet know whether ops.dump_cpu() will produce output + * and we may want to skip the default CPU dump if it doesn't. + * Use a nested seq_buf to generate the standard dump so that we + * can decide whether to commit later. + */ + avail = seq_buf_get_buf(s, &buf); + seq_buf_init(&ns, buf, avail); + + dump_newline(&ns); + dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", + cpu, rq->scx.nr_running, rq->scx.flags, + rq->scx.cpu_released, rq->scx.ops_qseq, + rq->scx.kick_sync); + dump_line(&ns, " curr=%s[%d] class=%ps", + rq->curr->comm, rq->curr->pid, + rq->curr->sched_class); + if (!cpumask_empty(rq->scx.cpus_to_kick)) + dump_line(&ns, " cpus_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick)); + if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) + dump_line(&ns, " idle_to_kick : %*pb", + cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); + if (!cpumask_empty(rq->scx.cpus_to_preempt)) + dump_line(&ns, " cpus_to_preempt: %*pb", + cpumask_pr_args(rq->scx.cpus_to_preempt)); + if (!cpumask_empty(rq->scx.cpus_to_wait)) + dump_line(&ns, " cpus_to_wait : %*pb", + cpumask_pr_args(rq->scx.cpus_to_wait)); + if (!cpumask_empty(rq->scx.cpus_to_sync)) + dump_line(&ns, " cpus_to_sync : %*pb", + cpumask_pr_args(rq->scx.cpus_to_sync)); + + used = seq_buf_used(&ns); + if (SCX_HAS_OP(sch, dump_cpu)) { + ops_dump_init(&ns, " "); + SCX_CALL_OP(sch, dump_cpu, rq, dctx, scx_cpu_arg(cpu), idle); + ops_dump_exit(); + } + + /* + * If idle && nothing generated by ops.dump_cpu(), there's + * nothing interesting. Skip. + */ + if (idle && used == seq_buf_used(&ns)) + goto next; + + /* + * $s may already have overflowed when $ns was created. If so, + * calling commit on it will trigger BUG. 
+ */ + if (avail) { + seq_buf_commit(s, seq_buf_used(&ns)); + if (seq_buf_has_overflowed(&ns)) + seq_buf_set_overflow(s); + } + + if (rq->curr->sched_class == &ext_sched_class && + (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) + scx_dump_task(sch, s, dctx, rq, rq->curr, '*'); + + list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) + if (dump_all_tasks || scx_task_on_sched(sch, p)) + scx_dump_task(sch, s, dctx, rq, p, ' '); +next: + rq_unlock_irqrestore(rq, &rf); +} + /* * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless * of which scheduler they belong to. If false, only dump tasks owned by @sch. @@ -6234,7 +6653,6 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, }; struct seq_buf s; struct scx_event_stats events; - char *buf; int cpu; guard(raw_spinlock_irqsave)(&scx_dump_lock); @@ -6255,8 +6673,13 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, if (ei->kind == SCX_EXIT_NONE) { dump_line(&s, "Debug dump triggered by %s", ei->reason); } else { - dump_line(&s, "%s[%d] triggered exit kind %d:", - current->comm, current->pid, ei->kind); + if (ei->exit_cpu >= 0) + dump_line(&s, "%s[%d] triggered exit kind %d on CPU %d:", + current->comm, current->pid, ei->kind, + ei->exit_cpu); + else + dump_line(&s, "%s[%d] triggered exit kind %d:", + current->comm, current->pid, ei->kind); dump_line(&s, " %s (%s)", ei->reason, ei->msg); dump_newline(&s); dump_line(&s, "Backtrace:"); @@ -6273,88 +6696,15 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, dump_line(&s, "CPU states"); dump_line(&s, "----------"); + /* + * Dump the exit CPU first so it isn't lost to dump truncation, then + * walk the rest in order, skipping the one already dumped. 
+ */ + if (ei->exit_cpu >= 0) + scx_dump_cpu(sch, &s, &dctx, ei->exit_cpu, dump_all_tasks); for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - struct task_struct *p; - struct seq_buf ns; - size_t avail, used; - bool idle; - - rq_lock_irqsave(rq, &rf); - - idle = list_empty(&rq->scx.runnable_list) && - rq->curr->sched_class == &idle_sched_class; - - if (idle && !SCX_HAS_OP(sch, dump_cpu)) - goto next; - - /* - * We don't yet know whether ops.dump_cpu() will produce output - * and we may want to skip the default CPU dump if it doesn't. - * Use a nested seq_buf to generate the standard dump so that we - * can decide whether to commit later. - */ - avail = seq_buf_get_buf(&s, &buf); - seq_buf_init(&ns, buf, avail); - - dump_newline(&ns); - dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu ksync=%lu", - cpu, rq->scx.nr_running, rq->scx.flags, - rq->scx.cpu_released, rq->scx.ops_qseq, - rq->scx.kick_sync); - dump_line(&ns, " curr=%s[%d] class=%ps", - rq->curr->comm, rq->curr->pid, - rq->curr->sched_class); - if (!cpumask_empty(rq->scx.cpus_to_kick)) - dump_line(&ns, " cpus_to_kick : %*pb", - cpumask_pr_args(rq->scx.cpus_to_kick)); - if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) - dump_line(&ns, " idle_to_kick : %*pb", - cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); - if (!cpumask_empty(rq->scx.cpus_to_preempt)) - dump_line(&ns, " cpus_to_preempt: %*pb", - cpumask_pr_args(rq->scx.cpus_to_preempt)); - if (!cpumask_empty(rq->scx.cpus_to_wait)) - dump_line(&ns, " cpus_to_wait : %*pb", - cpumask_pr_args(rq->scx.cpus_to_wait)); - if (!cpumask_empty(rq->scx.cpus_to_sync)) - dump_line(&ns, " cpus_to_sync : %*pb", - cpumask_pr_args(rq->scx.cpus_to_sync)); - - used = seq_buf_used(&ns); - if (SCX_HAS_OP(sch, dump_cpu)) { - ops_dump_init(&ns, " "); - SCX_CALL_OP(sch, dump_cpu, rq, &dctx, cpu, idle); - ops_dump_exit(); - } - - /* - * If idle && nothing generated by ops.dump_cpu(), there's - * nothing interesting. Skip. 
- */ - if (idle && used == seq_buf_used(&ns)) - goto next; - - /* - * $s may already have overflowed when $ns was created. If so, - * calling commit on it will trigger BUG. - */ - if (avail) { - seq_buf_commit(&s, seq_buf_used(&ns)); - if (seq_buf_has_overflowed(&ns)) - seq_buf_set_overflow(&s); - } - - if (rq->curr->sched_class == &ext_sched_class && - (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) - scx_dump_task(sch, &s, &dctx, rq, rq->curr, '*'); - - list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) - if (dump_all_tasks || scx_task_on_sched(sch, p)) - scx_dump_task(sch, &s, &dctx, rq, p, ' '); - next: - rq_unlock_irqrestore(rq, &rf); + if (cpu != ei->exit_cpu) + scx_dump_cpu(sch, &s, &dctx, cpu, dump_all_tasks); } dump_newline(&s); @@ -6392,9 +6742,9 @@ static void scx_disable_irq_workfn(struct irq_work *irq_work) kthread_queue_work(sch->helper, &sch->disable_work); } -static bool scx_vexit(struct scx_sched *sch, - enum scx_exit_kind kind, s64 exit_code, - const char *fmt, va_list args) +bool scx_vexit(struct scx_sched *sch, + enum scx_exit_kind kind, s64 exit_code, s32 exit_cpu, + const char *fmt, va_list args) { struct scx_exit_info *ei = sch->exit_info; @@ -6416,6 +6766,7 @@ static bool scx_vexit(struct scx_sched *sch, */ ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); + ei->exit_cpu = exit_cpu; irq_work_queue(&sch->disable_irq_work); return true; @@ -6473,13 +6824,32 @@ static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) } /* + * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid + * starvation. During the READY -> ENABLED task switching loop, the calling + * thread's sched_class gets switched from fair to ext. As fair has higher + * priority than ext, the calling thread can be indefinitely starved under + * fair-class saturation, leading to a system hang. 
+ */ +struct scx_enable_cmd { + struct kthread_work work; + union { + struct sched_ext_ops *ops; + struct sched_ext_ops_cid *ops_cid; + }; + bool is_cid_type; + struct bpf_map *arena_map; /* arena ref to transfer to sch */ + int ret; +}; + +/* * Allocate and initialize a new scx_sched. @cgrp's reference is always * consumed whether the function succeeds or fails. */ -static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, +static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd, struct cgroup *cgrp, struct scx_sched *parent) { + struct sched_ext_ops *ops = cmd->ops; struct scx_sched *sch; s32 level = parent ? parent->level + 1 : 0; s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; @@ -6559,7 +6929,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, sch->slice_dfl = SCX_SLICE_DFL; atomic_set(&sch->exit_kind, SCX_EXIT_NONE); - init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); + sch->disable_irq_work = IRQ_WORK_INIT_HARD(scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); @@ -6571,10 +6941,22 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, ret = -ENOMEM; goto err_free_lb_cpumask; } - sch->ops = *ops; + /* + * Copy ops through the right union view. For cid-form the source is + * struct sched_ext_ops_cid which lacks the trailing cpu_acquire/ + * cpu_release; those stay zero from kzalloc. 
+ */ + if (cmd->is_cid_type) { + sch->ops_cid = *cmd->ops_cid; + sch->is_cid_type = true; + } else { + sch->ops = *cmd->ops; + } + rcu_assign_pointer(ops->priv, sch); sch->kobj.kset = scx_kset; + INIT_LIST_HEAD(&sch->all); #ifdef CONFIG_EXT_SUB_SCHED char *buf = kzalloc(PATH_MAX, GFP_KERNEL); @@ -6602,6 +6984,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } @@ -6609,6 +6992,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, if (ops->sub_attach) { sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); if (!sch->sub_kset) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(-ENOMEM); } @@ -6616,14 +7000,32 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, #else /* CONFIG_EXT_SUB_SCHED */ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) { + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); return ERR_PTR(ret); } #endif /* CONFIG_EXT_SUB_SCHED */ + + /* + * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so + * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid + * drops the ref. After this point, sch owns the ref and any cleanup + * runs through scx_sched_free_rcu_work() which puts it. 
+ */ + sch->arena_map = cmd->arena_map; + /* BPF arena is only available on MMU && 64BIT */ +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) + if (sch->arena_map) + sch->arena_kern_base = bpf_arena_map_kern_vm_start(sch->arena_map); +#endif + cmd->arena_map = NULL; return sch; +#ifdef CONFIG_EXT_SUB_SCHED err_free_lb_resched: + RCU_INIT_POINTER(ops->priv, NULL); free_cpumask_var(sch->bypass_lb_resched_cpumask); +#endif err_free_lb_cpumask: free_cpumask_var(sch->bypass_lb_donee_cpumask); err_stop_helper: @@ -6688,6 +7090,17 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) } /* + * SCX_OPS_TID_TO_TASK is enabled by the root scheduler. A sub-sched + * may set it to declare a dependency; reject if the root hasn't + * enabled it. + */ + if ((ops->flags & SCX_OPS_TID_TO_TASK) && scx_parent(sch) && + !(scx_root->ops.flags & SCX_OPS_TID_TO_TASK)) { + scx_error(sch, "SCX_OPS_TID_TO_TASK requires root scheduler to enable it"); + return -EINVAL; + } + + /* * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle * selection policy to be enabled. */ @@ -6697,25 +7110,34 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) return -EINVAL; } - if (ops->cpu_acquire || ops->cpu_release) + /* + * cid-form's struct is shorter and doesn't include the cpu_acquire / + * cpu_release tail; reading those fields off a cid-form @ops would + * run past the BPF allocation. Skip for cid-form. + */ + if (!sch->is_cid_type && (ops->cpu_acquire || ops->cpu_release)) pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); + /* + * Sub-scheduler support is tied to the cid-form struct_ops. A sub-sched + * attaches through a cid-form-only interface (sub_attach/sub_detach), + * and a root that accepts sub-scheds must expose cid-form state to + * them. Reject cpu-form schedulers on either side. 
+ */ + if (!sch->is_cid_type) { + if (scx_parent(sch)) { + scx_error(sch, "sub-sched requires cid-form struct_ops"); + return -EINVAL; + } + if (ops->sub_attach || ops->sub_detach) { + scx_error(sch, "sub_attach/sub_detach requires cid-form struct_ops"); + return -EINVAL; + } + } + return 0; } -/* - * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid - * starvation. During the READY -> ENABLED task switching loop, the calling - * thread's sched_class gets switched from fair to ext. As fair has higher - * priority than ext, the calling thread can be indefinitely starved under - * fair-class saturation, leading to a system hang. - */ -struct scx_enable_cmd { - struct kthread_work work; - struct sched_ext_ops *ops; - int ret; -}; - static void scx_root_enable_workfn(struct kthread_work *work) { struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); @@ -6733,19 +7155,41 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_unlock; } + /* + * @ops->priv binds @ops to its scx_sched instance. It is set here by + * scx_alloc_and_add_sched() and cleared at the tail of bpf_scx_unreg(), + * which runs after scx_root_disable() has dropped scx_enable_mutex. If + * it's still non-NULL here, a previous attachment on @ops has not + * finished tearing down; proceeding would let the in-flight unreg's + * RCU_INIT_POINTER(NULL) clobber the @ops->priv we are about to assign. 
+ */ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto err_unlock; + } + ret = alloc_kick_syncs(); if (ret) goto err_unlock; + if (ops->flags & SCX_OPS_TID_TO_TASK) { + ret = rhashtable_init(&scx_tid_hash, &scx_tid_hash_params); + if (ret) + goto err_free_ksyncs; + } + #ifdef CONFIG_EXT_SUB_SCHED cgroup_get(cgrp); #endif - sch = scx_alloc_and_add_sched(ops, cgrp, NULL); + sch = scx_alloc_and_add_sched(cmd, cgrp, NULL); if (IS_ERR(sch)) { ret = PTR_ERR(sch); - goto err_free_ksyncs; + goto err_free_tid_hash; } + if (sch->is_cid_type) + static_branch_enable(&__scx_is_cid_type); + /* * Transition to ENABLING and clear exit info to arm the disable path. * Failure triggers full disabling from here on. @@ -6769,6 +7213,18 @@ static void scx_root_enable_workfn(struct kthread_work *work) cpus_read_lock(); /* + * Build the cid mapping before publishing scx_root. The cid kfuncs + * dereference the cid arrays unconditionally once scx_prog_sched() + * returns non-NULL; the rcu_assign_pointer() below pairs with their + * rcu_dereference() to make the populated arrays visible. + */ + ret = scx_cid_init(sch); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } + + /* * Make the scheduler instance visible. Must be inside cpus_read_lock(). * See handle_hotplug(). 
*/ @@ -6793,6 +7249,18 @@ static void scx_root_enable_workfn(struct kthread_work *work) sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; } + ret = scx_arena_pool_init(sch); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } + + ret = scx_set_cmask_scratch_alloc(sch); + if (ret) { + cpus_read_unlock(); + goto err_disable; + } + for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) if (((void (**)(void))ops)[i]) set_bit(i, sch->has_op); @@ -6811,6 +7279,31 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable; /* + * Attach the ext_server bandwidth reservation before anything is + * committed so that we can fail the enable if the root domain cannot + * accommodate it. The matching fair_server detach is deferred to the + * tail of this function, after the switch is fully committed and can no + * longer fail. + * + * On failure, err_disable funnels into scx_root_disable() which + * detaches ext_server, so partially-attached state is cleaned up + * automatically. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + scoped_guard(rq_lock_irqsave, rq) { + update_rq_clock(rq); + ret = dl_server_attach_bw(&rq->ext_server); + } + if (ret) { + pr_warn("sched_ext: failed to attach ext_server on CPU %d (%d)\n", + cpu, ret); + goto err_disable; + } + } + + /* * Once __scx_enabled is set, %current can be switched to SCX anytime. * This can lead to stalls as some BPF schedulers (e.g. userspace * scheduling) may not function correctly before all tasks are switched. @@ -6834,6 +7327,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) WARN_ON_ONCE(scx_init_task_enabled); scx_init_task_enabled = true; + /* flip under fork_rwsem; the iter below covers existing tasks */ + if (ops->flags & SCX_OPS_TID_TO_TASK) + static_branch_enable(&__scx_tid_to_task_enabled); + /* * Enable ops for every task. Fork is excluded by scx_fork_rwsem * preventing new tasks from being added. 
No need to exclude tasks @@ -6856,26 +7353,60 @@ static void scx_root_enable_workfn(struct kthread_work *work) scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { /* - * @p may already be dead, have lost all its usages counts and - * be waiting for RCU grace period before being freed. @p can't - * be initialized for SCX in such cases and should be ignored. + * @p is in scx_tasks under scx_tasks_lock, and SCX_TASK_DEAD + * tasks are filtered by scx_task_iter_next_locked(). + * sched_ext_dead() removes @p from scx_tasks under the same + * lock before put_task_struct_rcu_user() runs, so @p->usage + * is guaranteed > 0 here. */ - if (!tryget_task_struct(p)) - continue; + get_task_struct(p); + /* + * Set %INIT_BEGIN under the iter's rq lock so that a concurrent + * sched_ext_dead() does not call ops.exit_task() on @p while + * ops.init_task() is running. If sched_ext_dead() runs before + * this store, it has already removed @p from scx_tasks and the + * iter won't visit @p; if it runs after, it observes + * %INIT_BEGIN and transitions to %DEAD without calling ops, + * leaving the post-init recheck below to unwind. + */ + scx_set_task_state(p, SCX_TASK_INIT_BEGIN); scx_task_iter_unlock(&sti); - ret = scx_init_task(sch, p, false); - if (ret) { - put_task_struct(p); + ret = __scx_init_task(sch, p, false); + + scx_task_iter_relock(&sti, p); + + if (unlikely(ret)) { + if (scx_get_task_state(p) != SCX_TASK_DEAD) + scx_set_task_state(p, SCX_TASK_NONE); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); + put_task_struct(p); goto err_disable_unlock_all; } - scx_set_task_sched(p, sch); - scx_set_task_state(p, SCX_TASK_READY); + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() observed %INIT_BEGIN and set %DEAD. + * ops.exit_task() is owed to the sched __scx_init_task() + * ran against; call it now. 
+ */ + scx_sub_init_cancel_task(sch, p); + } else { + scx_set_task_state(p, SCX_TASK_INIT); + scx_set_task_sched(p, sch); + scx_set_task_state(p, SCX_TASK_READY); + } + + /* + * Insert into the tid hash. scx_tasks_lock is held by the iter; + * list_empty() guards against sched_ext_dead() having taken @p + * off the list while init ran unlocked. + */ + if (scx_tid_to_task_enabled() && !list_empty(&p->scx.tasks_node)) + scx_tid_hash_insert(p); put_task_struct(p); } @@ -6926,6 +7457,25 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) static_branch_enable(&__scx_switched_all); + /* + * Detach the fair_server bandwidth reservation now that the switch + * is fully committed. In full mode (!SCX_OPS_SWITCH_PARTIAL) no + * task will ever run in the fair class, so give that bandwidth + * back to the RT class. The matching ext_server attach already + * happened earlier; this only releases bandwidth and cannot fail. + * + * In partial mode keep fair_server attached. + */ + if (scx_switched_all()) { + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + guard(rq_lock_irqsave)(rq); + update_rq_clock(rq); + dl_server_detach_bw(&rq->fair_server); + } + } + pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", sch->ops.name, scx_switched_all() ? "" : " (partial)"); kobject_uevent(&sch->kobj, KOBJ_ADD); @@ -6936,6 +7486,9 @@ static void scx_root_enable_workfn(struct kthread_work *work) cmd->ret = 0; return; +err_free_tid_hash: + if (ops->flags & SCX_OPS_TID_TO_TASK) + rhashtable_free_and_destroy(&scx_tid_hash, NULL, NULL); err_free_ksyncs: free_kick_syncs(); err_unlock: @@ -7020,6 +7573,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto out_unlock; } + /* See scx_root_enable_workfn() for the @ops->priv check. 
*/ + if (rcu_access_pointer(ops->priv)) { + ret = -EBUSY; + goto out_unlock; + } + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); @@ -7037,7 +7596,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work) raw_spin_unlock_irq(&scx_sched_lock); /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ - sch = scx_alloc_and_add_sched(ops, cgrp, parent); + sch = scx_alloc_and_add_sched(cmd, cgrp, parent); kobject_put(&parent->kobj); if (IS_ERR(sch)) { ret = PTR_ERR(sch); @@ -7064,6 +7623,14 @@ static void scx_sub_enable_workfn(struct kthread_work *work) sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; } + ret = scx_arena_pool_init(sch); + if (ret) + goto err_disable; + + ret = scx_set_cmask_scratch_alloc(sch); + if (ret) + goto err_disable; + if (validate_ops(sch, ops)) goto err_disable; @@ -7126,9 +7693,8 @@ static void scx_sub_enable_workfn(struct kthread_work *work) if (p->scx.flags & SCX_TASK_SUB_INIT) continue; - /* see scx_root_enable() */ - if (!tryget_task_struct(p)) - continue; + /* @p is pinned by the iter; see scx_sub_disable() */ + get_task_struct(p); if (!assert_task_ready_or_enabled(p)) { ret = -EINVAL; @@ -7146,6 +7712,21 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto abort; rq = task_rq_lock(p, &rf); + + if (scx_get_task_state(p) == SCX_TASK_DEAD) { + /* + * sched_ext_dead() raced us between __scx_init_task() + * and this rq lock and ran exit_task() on $parent (the + * sched @p was on at that point), not on @sch. @sch's + * just-completed init is owed an exit_task() and we + * issue it here. + */ + scx_sub_init_cancel_task(sch, p); + task_rq_unlock(rq, p, &rf); + put_task_struct(p); + continue; + } + p->scx.flags |= SCX_TASK_SUB_INIT; task_rq_unlock(rq, p, &rf); @@ -7180,7 +7761,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work) * $p is now only initialized for @sch and READY, which * is what we want. Assign it to @sch and enable. 
*/ - rcu_assign_pointer(p->scx.sched, sch); + scx_set_task_sched(p, sch); scx_enable_task(sch, p); p->scx.flags &= ~SCX_TASK_SUB_INIT; @@ -7276,14 +7857,12 @@ static s32 __init scx_cgroup_lifetime_notifier_init(void) core_initcall(scx_cgroup_lifetime_notifier_init); #endif /* CONFIG_EXT_SUB_SCHED */ -static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +static s32 scx_enable(struct scx_enable_cmd *cmd, struct bpf_link *link) { static struct kthread_worker *helper; static DEFINE_MUTEX(helper_mutex); - struct scx_enable_cmd cmd; - if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), - cpu_possible_mask)) { + if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT)) { pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); return -EINVAL; } @@ -7304,16 +7883,15 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) } #ifdef CONFIG_EXT_SUB_SCHED - if (ops->sub_cgroup_id > 1) - kthread_init_work(&cmd.work, scx_sub_enable_workfn); + if (cmd->ops->sub_cgroup_id > 1) + kthread_init_work(&cmd->work, scx_sub_enable_workfn); else #endif /* CONFIG_EXT_SUB_SCHED */ - kthread_init_work(&cmd.work, scx_root_enable_workfn); - cmd.ops = ops; + kthread_init_work(&cmd->work, scx_root_enable_workfn); - kthread_queue_work(READ_ONCE(helper), &cmd.work); - kthread_flush_work(&cmd.work); - return cmd.ret; + kthread_queue_work(READ_ONCE(helper), &cmd->work); + kthread_flush_work(&cmd->work); + return cmd->ret; } @@ -7485,7 +8063,62 @@ static int bpf_scx_check_member(const struct btf_type *t, static int bpf_scx_reg(void *kdata, struct bpf_link *link) { - return scx_enable(kdata, link); + struct scx_enable_cmd cmd = { .ops = kdata }; + + return scx_enable(&cmd, link); +} + +struct scx_arena_scan { + struct bpf_map *arena; + int err; +}; + +/* + * The verifier enforces one arena per BPF program, so each struct_ops + * member prog contributes at most one arena via bpf_prog_arena(). + * Require all non-NULL contributions to match. 
+ */ +static int scx_arena_scan_prog(struct bpf_prog *prog, void *data) +{ + struct scx_arena_scan *s = data; + struct bpf_map *arena = NULL; + + /* arena.o, which defines these, is built only on MMU && 64BIT */ +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) + arena = bpf_prog_arena(prog); +#endif + if (!arena) + return 0; + if (s->arena && s->arena != arena) { + s->err = -EINVAL; + return 1; + } + s->arena = arena; + return 0; +} + +static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link) +{ + struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true }; + struct scx_arena_scan scan = {}; + int ret; + + bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan); + if (scan.err) { + pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n"); + return scan.err; + } + if (!scan.arena) { + pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n"); + return -EINVAL; + } + + bpf_map_inc(scan.arena); + cmd.arena_map = scan.arena; + ret = scx_enable(&cmd, link); + if (cmd.arena_map) /* not consumed by scx_alloc_and_add_sched() */ + bpf_map_put(cmd.arena_map); + return ret; } static void bpf_scx_unreg(void *kdata, struct bpf_link *link) @@ -7619,6 +8252,73 @@ static struct bpf_struct_ops bpf_sched_ext_ops = { .cfi_stubs = &__bpf_ops_sched_ext_ops }; +/* + * cid-form cfi stubs. Stubs whose signatures match the cpu-form (param types + * identical, only param names differ across structs) are reused; only + * set_cmask needs a fresh stub since the second argument type differs. 
+ */ +static void sched_ext_ops_cid__set_cmask(struct task_struct *p, + const struct scx_cmask *cmask) {} + +static struct sched_ext_ops_cid __bpf_ops_sched_ext_ops_cid = { + .select_cid = sched_ext_ops__select_cpu, + .enqueue = sched_ext_ops__enqueue, + .dequeue = sched_ext_ops__dequeue, + .dispatch = sched_ext_ops__dispatch, + .tick = sched_ext_ops__tick, + .runnable = sched_ext_ops__runnable, + .running = sched_ext_ops__running, + .stopping = sched_ext_ops__stopping, + .quiescent = sched_ext_ops__quiescent, + .yield = sched_ext_ops__yield, + .core_sched_before = sched_ext_ops__core_sched_before, + .set_weight = sched_ext_ops__set_weight, + .set_cmask = sched_ext_ops_cid__set_cmask, + .update_idle = sched_ext_ops__update_idle, + .init_task = sched_ext_ops__init_task, + .exit_task = sched_ext_ops__exit_task, + .enable = sched_ext_ops__enable, + .disable = sched_ext_ops__disable, +#ifdef CONFIG_EXT_GROUP_SCHED + .cgroup_init = sched_ext_ops__cgroup_init, + .cgroup_exit = sched_ext_ops__cgroup_exit, + .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, + .cgroup_move = sched_ext_ops__cgroup_move, + .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, + .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, + .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, + .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, +#endif + .sub_attach = sched_ext_ops__sub_attach, + .sub_detach = sched_ext_ops__sub_detach, + .cid_online = sched_ext_ops__cpu_online, + .cid_offline = sched_ext_ops__cpu_offline, + .init = sched_ext_ops__init, + .exit = sched_ext_ops__exit, + .dump = sched_ext_ops__dump, + .dump_cid = sched_ext_ops__dump_cpu, + .dump_task = sched_ext_ops__dump_task, +}; + +/* + * The cid-form struct_ops shares all bpf_struct_ops hooks with the cpu form. + * init_member, check_member, reg, unreg, etc. process kdata as the byte block + * verified to match by the BUILD_BUG_ON checks in scx_init(). 
+ */ +static struct bpf_struct_ops bpf_sched_ext_ops_cid = { + .verifier_ops = &bpf_scx_verifier_ops, + .reg = bpf_scx_reg_cid, + .unreg = bpf_scx_unreg, + .check_member = bpf_scx_check_member, + .init_member = bpf_scx_init_member, + .init = bpf_scx_init, + .update = bpf_scx_update, + .validate = bpf_scx_validate, + .name = "sched_ext_ops_cid", + .owner = THIS_MODULE, + .cfi_stubs = &__bpf_ops_sched_ext_ops_cid +}; + /******************************************************************************** * System integration and init. @@ -7628,13 +8328,11 @@ static void sysrq_handle_sched_ext_reset(u8 key) { struct scx_sched *sch; - rcu_read_lock(); sch = rcu_dereference(scx_root); if (likely(sch)) scx_disable(sch, SCX_EXIT_SYSRQ); else pr_info("sched_ext: BPF schedulers not loaded\n"); - rcu_read_unlock(); } static const struct sysrq_key_op sysrq_sched_ext_reset_op = { @@ -7646,7 +8344,11 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { static void sysrq_handle_sched_ext_dump(u8 key) { - struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; + struct scx_exit_info ei = { + .kind = SCX_EXIT_NONE, + .exit_cpu = -1, + .reason = "SysRq-D", + }; struct scx_sched *sch; list_for_each_entry_rcu(sch, &scx_sched_all, all) @@ -8716,9 +9418,6 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) struct rq *this_rq; unsigned long irq_flags; - if (!ops_cpu_valid(sch, cpu, NULL)) - return; - local_irq_save(irq_flags); this_rq = this_rq(); @@ -8781,11 +9480,36 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux guard(rcu)(); sch = scx_prog_sched(aux); - if (likely(sch)) + if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) scx_kick_cpu(sch, cpu, flags); } /** + * scx_bpf_kick_cid - Trigger reschedule on the CPU mapped to @cid + * @cid: cid to kick + * @flags: %SCX_KICK_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_kick_cpu(). 
Return 0 on success, + * -errno otherwise. + */ +__bpf_kfunc s32 scx_bpf_kick_cid(s32 cid, u64 flags, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return -ENODEV; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return cpu; + scx_kick_cpu(sch, cpu, flags); + return 0; +} + +/** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs @@ -8811,9 +9535,9 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id, const struct bpf_prog_aux *aux ret = READ_ONCE(this_rq()->scx.local_dsq.nr); goto out; } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { - s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; + s32 cpu = scx_cpu_ret(sch, dsq_id & SCX_DSQ_LOCAL_CPU_MASK); - if (ops_cpu_valid(sch, cpu, NULL)) { + if (scx_cpu_valid(sch, cpu, NULL)) { ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); goto out; } @@ -9031,6 +9755,7 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) __bpf_kfunc_end_defs(); +__printf(5, 0) static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, size_t line_size, char *fmt, unsigned long long *data, u32 data__sz) @@ -9068,6 +9793,7 @@ static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, return ret; } +__printf(3, 0) static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, char *fmt, unsigned long long *data, u32 data__sz) { @@ -9088,6 +9814,7 @@ __bpf_kfunc_start_defs(); * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops * disabling. 
*/ +__printf(2, 0) __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz, const struct bpf_prog_aux *aux) @@ -9113,6 +9840,7 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, * Indicate that the BPF scheduler encountered a fatal error and initiate ops * disabling. */ +__printf(1, 0) __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz, const struct bpf_prog_aux *aux) { @@ -9140,6 +9868,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, * The extra dump may be multiple lines. A single line may be split over * multiple calls. The last line is automatically terminated. */ +__printf(1, 0) __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data__sz, const struct bpf_prog_aux *aux) { @@ -9202,13 +9931,36 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) guard(rcu)(); sch = scx_prog_sched(aux); - if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) + if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) return arch_scale_cpu_capacity(cpu); else return SCX_CPUPERF_ONE; } /** + * scx_bpf_cidperf_cap - Query the maximum relative capacity of the CPU at @cid + * @cid: cid of the CPU to query + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_cpuperf_cap(). 
+ */ +__bpf_kfunc u32 scx_bpf_cidperf_cap(s32 cid, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return SCX_CPUPERF_ONE; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return SCX_CPUPERF_ONE; + return arch_scale_cpu_capacity(cpu); +} + +/** * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU * @cpu: CPU of interest * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs @@ -9230,13 +9982,36 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) guard(rcu)(); sch = scx_prog_sched(aux); - if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) + if (likely(sch) && scx_cpu_valid(sch, cpu, NULL)) return arch_scale_freq_capacity(cpu); else return SCX_CPUPERF_ONE; } /** + * scx_bpf_cidperf_cur - Query the current performance of the CPU at @cid + * @cid: cid of the CPU to query + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_cpuperf_cur(). 
+ */ +__bpf_kfunc u32 scx_bpf_cidperf_cur(s32 cid, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return SCX_CPUPERF_ONE; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return SCX_CPUPERF_ONE; + return arch_scale_freq_capacity(cpu); +} + +/** * scx_bpf_cpuperf_set - Set the relative performance target of a CPU * @cpu: CPU of interest * @perf: target performance level [0, %SCX_CPUPERF_ONE] @@ -9266,7 +10041,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_au return; } - if (ops_cpu_valid(sch, cpu, NULL)) { + if (scx_cpu_valid(sch, cpu, NULL)) { struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); struct rq_flags rf; @@ -9297,6 +10072,31 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_au } /** + * scx_bpf_cidperf_set - Set the performance target of the CPU at @cid + * @cid: cid of the CPU to target + * @perf: target performance level [0, %SCX_CPUPERF_ONE] + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_cpuperf_set(). + */ +__bpf_kfunc void scx_bpf_cidperf_set(s32 cid, u32 perf, + const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return; + scx_bpf_cpuperf_set(cpu, perf, aux); +} + +/** * scx_bpf_nr_node_ids - Return the number of possible node IDs * * All valid node IDs in the system are smaller than the returned value. @@ -9317,6 +10117,47 @@ __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) } /** + * scx_bpf_nr_cids - Return the size of the cid space + * + * Equals num_possible_cpus(). All valid cids are in [0, return value). 
+ */ +__bpf_kfunc u32 scx_bpf_nr_cids(void) +{ + return num_possible_cpus(); +} + +/** + * scx_bpf_nr_online_cids - Return current count of online CPUs in cid space + * + * Return num_online_cpus(). The standard model restarts the scheduler on + * hotplug, which lets schedulers treat [0, nr_online_cids) as the online + * range. Schedulers that prefer to handle hotplug without a restart should + * install a custom mapping via scx_bpf_cid_override() and track onlining + * through the ops.cid_online / ops.cid_offline callbacks. + */ +__bpf_kfunc u32 scx_bpf_nr_online_cids(void) +{ + return num_online_cpus(); +} + +/** + * scx_bpf_this_cid - Return the cid of the CPU this program is running on + * + * cid-addressed equivalent of bpf_get_smp_processor_id() for scx programs. + * The current cpu is trivially valid, so this is just a table lookup. Return + * -EINVAL if called from a non-SCX program before any scheduler has ever + * been enabled (the cid table is still unallocated at that point). + */ +__bpf_kfunc s32 scx_bpf_this_cid(void) +{ + s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); + + if (!tbl) + return -EINVAL; + return tbl[raw_smp_processor_id()]; +} + +/** * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask */ __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) @@ -9365,6 +10206,23 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) } /** + * scx_bpf_task_cid - cid a task is currently associated with + * @p: task of interest + * + * cid-addressed equivalent of scx_bpf_task_cpu(). task_cpu(p) is always a + * valid cpu, so this is just a table lookup. Return -EINVAL if called from + * a non-SCX program before any scheduler has ever been enabled. 
+ */ +__bpf_kfunc s32 scx_bpf_task_cid(const struct task_struct *p) +{ + s16 *tbl = READ_ONCE(scx_cpu_to_cid_tbl); + + if (!tbl) + return -EINVAL; + return tbl[task_cpu(p)]; +} + +/** * scx_bpf_cpu_rq - Fetch the rq of a CPU * @cpu: CPU of the rq * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs @@ -9379,7 +10237,7 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) if (unlikely(!sch)) return NULL; - if (!ops_cpu_valid(sch, cpu, NULL)) + if (!scx_cpu_valid(sch, cpu, NULL)) return NULL; if (!sch->warned_deprecated_rq) { @@ -9436,13 +10294,65 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_ if (unlikely(!sch)) return NULL; - if (!ops_cpu_valid(sch, cpu, NULL)) + if (!scx_cpu_valid(sch, cpu, NULL)) return NULL; return rcu_dereference(cpu_rq(cpu)->curr); } /** + * scx_bpf_cid_curr - Return the curr task on the CPU at @cid + * @cid: cid of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * cid-addressed equivalent of scx_bpf_cpu_curr(). Callers must hold RCU + * read lock (KF_RCU_PROTECTED). + */ +__bpf_kfunc struct task_struct *scx_bpf_cid_curr(s32 cid, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + s32 cpu; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return NULL; + cpu = scx_cid_to_cpu(sch, cid); + if (cpu < 0) + return NULL; + return rcu_dereference(cpu_rq(cpu)->curr); +} + +/** + * scx_bpf_tid_to_task - Look up a task by its scx tid + * @tid: task ID previously read from p->scx.tid + * + * Returns the task with the given tid, or NULL if no such task exists. The + * returned pointer is valid until the end of the current RCU read section + * (KF_RCU_PROTECTED). Requires SCX_OPS_TID_TO_TASK to be set on the root + * scheduler; otherwise an error is raised and NULL returned.
+ */ +__bpf_kfunc struct task_struct *scx_bpf_tid_to_task(u64 tid) +{ + struct sched_ext_entity *scx; + + if (!scx_tid_to_task_enabled()) { + struct scx_sched *sch = rcu_dereference(scx_root); + + if (sch) + scx_error(sch, "scx_bpf_tid_to_task() called without SCX_OPS_TID_TO_TASK"); + return NULL; + } + + scx = rhashtable_lookup(&scx_tid_hash, &tid, scx_tid_hash_params); + if (!scx) + return NULL; + + return container_of(scx, struct task_struct, scx); +} + +/** * scx_bpf_now - Returns a high-performance monotonically non-decreasing * clock for the current CPU. The clock returned is in nanoseconds. * @@ -9601,6 +10511,7 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_kick_cid, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) @@ -9615,16 +10526,25 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cidperf_cap, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cidperf_cur, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cidperf_set, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) +BTF_ID_FLAGS(func, scx_bpf_nr_cids) +BTF_ID_FLAGS(func, scx_bpf_nr_online_cids) +BTF_ID_FLAGS(func, scx_bpf_this_cid) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, 
scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_task_cid, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, scx_bpf_cid_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, scx_bpf_tid_to_task, KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_now) BTF_ID_FLAGS(func, scx_bpf_events) #ifdef CONFIG_CGROUP_SCHED @@ -9639,6 +10559,47 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = { }; /* + * cpu-form kfuncs that are forbidden from cid-form schedulers + * (bpf_sched_ext_ops_cid). Programs targeting the cid struct_ops type must + * use the cid-form alternative (cid/cmask kfuncs). + * + * Membership overlaps with scx_kfunc_ids_{any,idle,select_cpu}; the filter + * tests this set independently and rejects matches before the per-op + * allow-list check runs. + * + * pahole/resolve_btfids scans every BTF_ID_FLAGS() at build time and + * intersects flags across duplicate entries, so each entry must carry the + * same flags as the kfunc's primary declaration; otherwise the flags get + * dropped globally. 
+ */ +BTF_KFUNCS_START(scx_kfunc_ids_cpu_only) +BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_cpu_only) + +/* * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc * group; an op may permit zero or more groups, with the union expressed in * scx_kf_allow_flags[]. 
The verifier-time filter (scx_kfunc_context_filter()) @@ -9647,10 +10608,11 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = { */ enum scx_kf_allow_flags { SCX_KF_ALLOW_UNLOCKED = 1 << 0, - SCX_KF_ALLOW_CPU_RELEASE = 1 << 1, - SCX_KF_ALLOW_DISPATCH = 1 << 2, - SCX_KF_ALLOW_ENQUEUE = 1 << 3, - SCX_KF_ALLOW_SELECT_CPU = 1 << 4, + SCX_KF_ALLOW_INIT = 1 << 1, + SCX_KF_ALLOW_CPU_RELEASE = 1 << 2, + SCX_KF_ALLOW_DISPATCH = 1 << 3, + SCX_KF_ALLOW_ENQUEUE = 1 << 4, + SCX_KF_ALLOW_SELECT_CPU = 1 << 5, }; /* @@ -9678,7 +10640,7 @@ static const u32 scx_kf_allow_flags[] = { [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED, [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED, [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED, - [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED | SCX_KF_ALLOW_INIT, [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED, }; @@ -9693,16 +10655,18 @@ static const u32 scx_kf_allow_flags[] = { int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) { bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id); + bool in_init = btf_id_set8_contains(&scx_kfunc_ids_init, kfunc_id); bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id); bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); bool in_idle = btf_id_set8_contains(&scx_kfunc_ids_idle, kfunc_id); bool in_any = btf_id_set8_contains(&scx_kfunc_ids_any, kfunc_id); + bool in_cpu_only = btf_id_set8_contains(&scx_kfunc_ids_cpu_only, kfunc_id); u32 moff, flags; /* Not an SCX kfunc - allow. 
*/ - if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || + if (!(in_unlocked || in_init || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release || in_idle || in_any)) return 0; @@ -9725,8 +10689,24 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) /* * Non-SCX struct_ops: SCX kfuncs are not permitted. + * + * Both bpf_sched_ext_ops (cpu-form) and bpf_sched_ext_ops_cid + * (cid-form) are valid SCX struct_ops. Member offsets match between + * the two (verified by BUILD_BUG_ON in scx_init()), so the shared + * scx_kf_allow_flags[] table indexed by SCX_MOFF_IDX(moff) applies to + * both. + */ + if (prog->aux->st_ops != &bpf_sched_ext_ops && + prog->aux->st_ops != &bpf_sched_ext_ops_cid) + return -EACCES; + + /* + * cid-form schedulers must use cid/cmask kfuncs. cid and cpu are both + * small s32s and trivially confused, so cpu-only kfuncs are rejected at + * load time. The reverse (cpu-form calling cid-form kfuncs) is + * intentionally permissive to ease gradual cpumask -> cid migration. */ - if (prog->aux->st_ops != &bpf_sched_ext_ops) + if (prog->aux->st_ops == &bpf_sched_ext_ops_cid && in_cpu_only) return -EACCES; /* SCX struct_ops: check the per-op allow list. */ @@ -9738,6 +10718,8 @@ int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked) return 0; + if ((flags & SCX_KF_ALLOW_INIT) && in_init) + return 0; if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release) return 0; if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch) @@ -9755,6 +10737,73 @@ static int __init scx_init(void) int ret; /* + * sched_ext_ops_cid mirrors sched_ext_ops up to and including @priv. + * Both bpf_scx_init_member() and bpf_scx_check_member() use offsets + * from struct sched_ext_ops; sched_ext_ops_cid relies on those offsets + * matching for the shared fields. Catch any drift at build time.
+ */ +#define CID_OFFSET_MATCH(cpu_field, cid_field) \ + BUILD_BUG_ON(offsetof(struct sched_ext_ops, cpu_field) != \ + offsetof(struct sched_ext_ops_cid, cid_field)) + /* data fields used by bpf_scx_init_member() */ + CID_OFFSET_MATCH(dispatch_max_batch, dispatch_max_batch); + CID_OFFSET_MATCH(flags, flags); + CID_OFFSET_MATCH(name, name); + CID_OFFSET_MATCH(timeout_ms, timeout_ms); + CID_OFFSET_MATCH(exit_dump_len, exit_dump_len); + CID_OFFSET_MATCH(hotplug_seq, hotplug_seq); + CID_OFFSET_MATCH(sub_cgroup_id, sub_cgroup_id); + /* shared callbacks: the union view requires byte-for-byte offset match */ + CID_OFFSET_MATCH(enqueue, enqueue); + CID_OFFSET_MATCH(dequeue, dequeue); + CID_OFFSET_MATCH(dispatch, dispatch); + CID_OFFSET_MATCH(tick, tick); + CID_OFFSET_MATCH(runnable, runnable); + CID_OFFSET_MATCH(running, running); + CID_OFFSET_MATCH(stopping, stopping); + CID_OFFSET_MATCH(quiescent, quiescent); + CID_OFFSET_MATCH(yield, yield); + CID_OFFSET_MATCH(core_sched_before, core_sched_before); + CID_OFFSET_MATCH(set_weight, set_weight); + CID_OFFSET_MATCH(update_idle, update_idle); + CID_OFFSET_MATCH(init_task, init_task); + CID_OFFSET_MATCH(exit_task, exit_task); + CID_OFFSET_MATCH(enable, enable); + CID_OFFSET_MATCH(disable, disable); + CID_OFFSET_MATCH(dump, dump); + CID_OFFSET_MATCH(dump_task, dump_task); + CID_OFFSET_MATCH(sub_attach, sub_attach); + CID_OFFSET_MATCH(sub_detach, sub_detach); + CID_OFFSET_MATCH(init, init); + CID_OFFSET_MATCH(exit, exit); +#ifdef CONFIG_EXT_GROUP_SCHED + CID_OFFSET_MATCH(cgroup_init, cgroup_init); + CID_OFFSET_MATCH(cgroup_exit, cgroup_exit); + CID_OFFSET_MATCH(cgroup_prep_move, cgroup_prep_move); + CID_OFFSET_MATCH(cgroup_move, cgroup_move); + CID_OFFSET_MATCH(cgroup_cancel_move, cgroup_cancel_move); + CID_OFFSET_MATCH(cgroup_set_weight, cgroup_set_weight); + CID_OFFSET_MATCH(cgroup_set_bandwidth, cgroup_set_bandwidth); + CID_OFFSET_MATCH(cgroup_set_idle, cgroup_set_idle); +#endif + /* renamed callbacks must occupy the same 
slot as their cpu-form sibling */ + CID_OFFSET_MATCH(select_cpu, select_cid); + CID_OFFSET_MATCH(set_cpumask, set_cmask); + CID_OFFSET_MATCH(cpu_online, cid_online); + CID_OFFSET_MATCH(cpu_offline, cid_offline); + CID_OFFSET_MATCH(dump_cpu, dump_cid); + /* @priv tail must align since both share the same data block */ + CID_OFFSET_MATCH(priv, priv); + /* + * cid-form must end exactly at @priv - validate_ops() skips + * cpu_acquire/cpu_release for cid-form because reading those fields + * past the BPF allocation would be UB. + */ + BUILD_BUG_ON(offsetof(struct sched_ext_ops_cid, __end) != + offsetofend(struct sched_ext_ops, priv)); +#undef CID_OFFSET_MATCH + + /* * kfunc registration can't be done from init_sched_ext_class() as * register_btf_kfunc_id_set() needs most of the system to be up. * @@ -9792,12 +10841,24 @@ static int __init scx_init(void) return ret; } + ret = scx_cid_kfunc_init(); + if (ret) { + pr_err("sched_ext: Failed to register cid kfuncs (%d)\n", ret); + return ret; + } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); if (ret) { pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); return ret; } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops_cid, sched_ext_ops_cid); + if (ret) { + pr_err("sched_ext: Failed to register cid struct_ops (%d)\n", ret); + return ret; + } + ret = register_pm_notifier(&scx_pm_notifier); if (ret) { pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c new file mode 100644 index 000000000000..493c2424f842 --- /dev/null +++ b/kernel/sched/ext_arena.c @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages. + * + * Each chunk added to @sch->arena_pool comes from one + * bpf_arena_alloc_pages_sleepable() call and is registered at the + * kernel-side mapping address. 
Callers translate to the BPF-arena form + * themselves if needed. + * + * Allocations grow the pool on demand. Underlying arena pages are released + * when the arena map itself is torn down. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ + +enum scx_arena_consts { + SCX_ARENA_MIN_ORDER = 3, /* 8-byte minimum sub-allocation */ + SCX_ARENA_GROW_PAGES = 4, /* per growth */ +}; + +s32 scx_arena_pool_init(struct scx_sched *sch) +{ + if (!sch->arena_map) + return 0; + + sch->arena_pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE); + if (!sch->arena_pool) + return -ENOMEM; + return 0; +} + +static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk, + void *data) +{ + int order = pool->min_alloc_order; + size_t chunk_sz = chunk->end_addr - chunk->start_addr + 1; + unsigned long end_bit = chunk_sz >> order; + unsigned long b, e; + + for_each_set_bitrange(b, e, chunk->bits, end_bit) + gen_pool_free(pool, chunk->start_addr + (b << order), + (e - b) << order); +} + +/* + * Tear down the pool. Outstanding gen_pool allocations are freed via + * scx_arena_clear_chunk() so gen_pool_destroy() doesn't BUG. The underlying + * arena pages are released when the arena map itself is torn down. + */ +void scx_arena_pool_destroy(struct scx_sched *sch) +{ + if (!sch->arena_pool) + return; + gen_pool_for_each_chunk(sch->arena_pool, scx_arena_clear_chunk, NULL); + gen_pool_destroy(sch->arena_pool); + sch->arena_pool = NULL; +} + +/* + * Grow the pool by @page_cnt pages. bpf_arena_alloc_pages_sleepable() and + * gen_pool_add() (which calls vzalloc(GFP_KERNEL)) require a sleepable + * context. 
+ */ +static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt) +{ + u64 kern_vm_start; + u32 uaddr32; + void *p; + int ret; + + if (!sch->arena_map || !sch->arena_pool) + return -EINVAL; + + p = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL, + page_cnt, NUMA_NO_NODE, 0); + if (!p) + return -ENOMEM; + + uaddr32 = (u32)(unsigned long)p; + /* arena.o, which defines these, is built only on MMU && 64BIT */ +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT) + kern_vm_start = bpf_arena_map_kern_vm_start(sch->arena_map); +#else + kern_vm_start = 0; +#endif + + ret = gen_pool_add(sch->arena_pool, kern_vm_start + uaddr32, + page_cnt * PAGE_SIZE, NUMA_NO_NODE); + if (ret) { + bpf_arena_free_pages_non_sleepable(sch->arena_map, p, page_cnt); + return ret; + } + return 0; +} + +/* + * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL + * on failure. May grow the pool via scx_arena_grow() which sleeps. Caller must + * be in a GFP_KERNEL context. + */ +void *scx_arena_alloc(struct scx_sched *sch, size_t size) +{ + unsigned long kern_va; + u32 page_cnt; + + might_sleep(); + + if (!sch->arena_pool) + return NULL; + + while (true) { + kern_va = gen_pool_alloc(sch->arena_pool, size); + if (kern_va) + break; + page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES, + (size + PAGE_SIZE - 1) >> PAGE_SHIFT); + if (scx_arena_grow(sch, page_cnt)) + return NULL; + } + + return (void *)kern_va; +} + +void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size) +{ + if (sch->arena_pool && kern_va) + gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size); +} diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h new file mode 100644 index 000000000000..4f3610160102 --- /dev/null +++ b/kernel/sched/ext_arena.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + */ +#ifndef _KERNEL_SCHED_EXT_ARENA_H +#define _KERNEL_SCHED_EXT_ARENA_H + +struct scx_sched; + +s32 scx_arena_pool_init(struct scx_sched *sch); +void scx_arena_pool_destroy(struct scx_sched *sch); +void *scx_arena_alloc(struct scx_sched *sch, size_t size); +void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size); + +#endif /* _KERNEL_SCHED_EXT_ARENA_H */ diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c new file mode 100644 index 000000000000..66944a7ef79d --- /dev/null +++ b/kernel/sched/ext_cid.c @@ -0,0 +1,707 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#include <linux/cacheinfo.h> + +/* + * cid tables. + * + * Pointers are published once on first enable and never revoked. The default + * mapping is populated before ops.init() runs; scx_bpf_cid_override() commits + * before it returns. As long as the BPF scheduler only uses the tables from + * those points onward, it sees a consistent view. + */ +s16 *scx_cid_to_cpu_tbl; +s16 *scx_cpu_to_cid_tbl; +struct scx_cid_topo *scx_cid_topo; + +#define SCX_CID_TOPO_NEG (struct scx_cid_topo) { \ + .core_cid = -1, .core_idx = -1, .llc_cid = -1, .llc_idx = -1, \ + .node_cid = -1, .node_idx = -1, \ +} + +/* + * Return @cpu's LLC shared_cpu_map. If cacheinfo isn't populated (offline or + * !present), record @cpu in @fallbacks and return its node mask instead - the + * worst that can happen is that the cpu's LLC becomes coarser than reality. 
+ */ +static const struct cpumask *cpu_llc_mask(int cpu, struct cpumask *fallbacks) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + + if (!ci || !ci->info_list || !ci->num_leaves) { + cpumask_set_cpu(cpu, fallbacks); + return cpumask_of_node(cpu_to_node(cpu)); + } + return &ci->info_list[ci->num_leaves - 1].shared_cpu_map; +} + +/* Allocate the cid tables once on first enable; never freed. */ +static s32 scx_cid_arrays_alloc(void) +{ + u32 npossible = num_possible_cpus(); + s16 *cid_to_cpu, *cpu_to_cid; + struct scx_cid_topo *cid_topo; + + if (scx_cid_to_cpu_tbl) + return 0; + + cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL); + cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL); + cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL); + + if (!cid_to_cpu || !cpu_to_cid || !cid_topo) { + kfree(cid_to_cpu); + kfree(cpu_to_cid); + kfree(cid_topo); + return -ENOMEM; + } + + WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu); + WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid); + WRITE_ONCE(scx_cid_topo, cid_topo); + return 0; +} + +/** + * scx_cid_init - build the cid mapping + * @sch: the scx_sched being initialized; used as the scx_error() target + * + * See "Topological CPU IDs" in ext_cid.h for the model. Walk online cpus by + * intersection at each level (parent_scratch & this_level_mask), which keeps + * containment correct by construction and naturally splits a physical LLC + * straddling two NUMA nodes into two LLC units. The caller must hold + * cpus_read_lock. 
+ */ +s32 scx_cid_init(struct scx_sched *sch) +{ + cpumask_var_t to_walk __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t node_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t llc_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t core_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t llc_fallback __free(free_cpumask_var) = CPUMASK_VAR_NULL; + cpumask_var_t online_no_topo __free(free_cpumask_var) = CPUMASK_VAR_NULL; + u32 next_cid = 0; /* next cid to hand out; cids are assigned densely starting at 0 */ + s32 next_node_idx = 0, next_llc_idx = 0, next_core_idx = 0; /* running indices recorded as node_idx/llc_idx/core_idx in scx_cid_topo */ + s32 cpu, ret; /* NOTE(review): @sch is not referenced anywhere in this function body */ + + /* CMASK_MAX_WORDS in cid.bpf.h covers NR_CPUS up to 8192 */ + BUILD_BUG_ON(NR_CPUS > 8192); + + lockdep_assert_cpus_held(); + + ret = scx_cid_arrays_alloc(); + if (ret) + return ret; + + if (!zalloc_cpumask_var(&to_walk, GFP_KERNEL) || + !zalloc_cpumask_var(&node_scratch, GFP_KERNEL) || + !zalloc_cpumask_var(&llc_scratch, GFP_KERNEL) || + !zalloc_cpumask_var(&core_scratch, GFP_KERNEL) || + !zalloc_cpumask_var(&llc_fallback, GFP_KERNEL) || + !zalloc_cpumask_var(&online_no_topo, GFP_KERNEL)) + return -ENOMEM; /* masks allocated so far are released by their __free(free_cpumask_var) annotations */ + + /* -1 sentinels for sparse-possible cpu id holes (0 is a valid cid) */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + scx_cpu_to_cid_tbl[cpu] = -1; + + cpumask_copy(to_walk, cpu_online_mask); /* only cpus online right now are walked for topology */ + + while (!cpumask_empty(to_walk)) { + s32 next_cpu = cpumask_first(to_walk); + s32 nid = cpu_to_node(next_cpu); + s32 node_cid = next_cid; /* first cid handed out within this node */ + s32 node_idx; + + /* + * No NUMA info: skip and let the tail loop assign a no-topo + * cid. cpumask_of_node(-1) is undefined. 
+ */ + if (nid < 0) { + cpumask_clear_cpu(next_cpu, to_walk); + continue; + } + + node_idx = next_node_idx++; + + /* node_scratch = to_walk & this node; next_cpu is only removed from to_walk by way of node_scratch, hence the membership check below */ + cpumask_and(node_scratch, to_walk, cpumask_of_node(nid)); + if (WARN_ON_ONCE(!cpumask_test_cpu(next_cpu, node_scratch))) + return -EINVAL; + + while (!cpumask_empty(node_scratch)) { + s32 ncpu = cpumask_first(node_scratch); + const struct cpumask *llc_mask = cpu_llc_mask(ncpu, llc_fallback); /* NOTE(review): cpu_llc_mask() is defined outside this view; presumably it records @ncpu in llc_fallback when it has to fall back - confirm */ + s32 llc_cid = next_cid; /* first cid handed out within this llc */ + s32 llc_idx = next_llc_idx++; + + /* llc_scratch = node_scratch & this llc */ + cpumask_and(llc_scratch, node_scratch, llc_mask); + if (WARN_ON_ONCE(!cpumask_test_cpu(ncpu, llc_scratch))) + return -EINVAL; + + while (!cpumask_empty(llc_scratch)) { + s32 lcpu = cpumask_first(llc_scratch); + const struct cpumask *sib = topology_sibling_cpumask(lcpu); + s32 core_cid = next_cid; /* first cid handed out within this core */ + s32 core_idx = next_core_idx++; + s32 ccpu; + + /* core_scratch = llc_scratch & this core */ + cpumask_and(core_scratch, llc_scratch, sib); + if (WARN_ON_ONCE(!cpumask_test_cpu(lcpu, core_scratch))) + return -EINVAL; + + for_each_cpu(ccpu, core_scratch) { + s32 cid = next_cid++; + + scx_cid_to_cpu_tbl[cid] = ccpu; + scx_cpu_to_cid_tbl[ccpu] = cid; + scx_cid_topo[cid] = (struct scx_cid_topo){ + .core_cid = core_cid, + .core_idx = core_idx, + .llc_cid = llc_cid, + .llc_idx = llc_idx, + .node_cid = node_cid, + .node_idx = node_idx, + }; + + cpumask_clear_cpu(ccpu, llc_scratch); + cpumask_clear_cpu(ccpu, node_scratch); + cpumask_clear_cpu(ccpu, to_walk); /* a consumed cpu is dropped from all three enclosing worklists */ + } + } + } + } + + /* + * No-topo section: any possible cpu without a cid - normally just the + * not-online ones. Collect any currently-online cpus that land here in + * @online_no_topo so we can warn about them at the end. 
+ */ + for_each_cpu(cpu, cpu_possible_mask) { + s32 cid; + + if (__scx_cpu_to_cid(cpu) != -1) + continue; /* already given a cid by the topology walk above */ + if (cpu_online(cpu)) + cpumask_set_cpu(cpu, online_no_topo); + + cid = next_cid++; + scx_cid_to_cpu_tbl[cid] = cpu; + scx_cpu_to_cid_tbl[cpu] = cid; + scx_cid_topo[cid] = SCX_CID_TOPO_NEG; + } + + if (!cpumask_empty(llc_fallback)) + pr_warn("scx_cid: cpus without cacheinfo, using node mask as llc: %*pbl\n", + cpumask_pr_args(llc_fallback)); + if (!cpumask_empty(online_no_topo)) + pr_warn("scx_cid: online cpus with no usable topology: %*pbl\n", + cpumask_pr_args(online_no_topo)); + + return 0; +} + +/** + * scx_cmask_clear - Zero every bit in @m's active range + * @m: cmask to clear + * + * Storage past the active range is left as is. The words the range spans are + * zeroed whole, so padding bits in the first and last word are cleared too. + * An empty range (nr_cids == 0) is a no-op. + */ +void scx_cmask_clear(struct scx_cmask *m) +{ + u32 nr_words; + + if (!m->nr_cids) + return; + nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1; /* words from the one holding base through the one holding the last cid */ + memset(m->bits, 0, nr_words * sizeof(u64)); +} + +/** + * scx_cmask_fill - Set every bit in @m's active range + * @m: cmask to fill + * + * Counterpart to scx_cmask_clear(). Storage past the active range is left as is. + * Padding bits below base in the first word and past the range end in the + * last word are cleared after the fill. + */ +void scx_cmask_fill(struct scx_cmask *m) +{ + u32 nr_words, head_bits, tail_bits; + + if (!m->nr_cids) + return; + nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1; + memset(m->bits, 0xff, nr_words * sizeof(u64)); + + /* clear word-0 bits below base */ + head_bits = m->base & 63; + if (head_bits) + m->bits[0] &= ~((1ULL << head_bits) - 1); + + /* clear last-word bits at or past base + nr_cids; tail_bits == 0 means the range ends on a word boundary and nothing needs trimming */ + tail_bits = (m->base + m->nr_cids) & 63; + if (tail_bits) + m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1; +} + +/** + * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask + * @src: source cpumask + * @dst: cmask to write + * + * Clear @dst's active range and set the bit for each cid whose cpu is in + * @src and lies within that range. Out-of-range cids are silently ignored. 
+ */ +void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst) +{ + s32 cpu; + + scx_cmask_clear(dst); + for_each_cpu(cpu, src) { + s32 cid = __scx_cpu_to_cid(cpu); + + if (cid >= 0) /* a negative table entry means no cid is assigned to this cpu */ + __scx_cmask_set(cid, dst); + } +} + +__bpf_kfunc_start_defs(); + +/** + * scx_bpf_cid_override - Install an explicit cpu->cid mapping + * @cpu_to_cid: array of nr_cpu_ids s32 entries (cid for each cpu) + * @cpu_to_cid__sz: must be nr_cpu_ids * sizeof(s32) bytes + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * May only be called from ops.init() of the root scheduler. Replace the + * topology-probed cid mapping with the caller-provided one. Each possible cpu + * must map to a unique cid in [0, num_possible_cpus()). Topo info is cleared. + * On invalid input, trigger scx_error() to abort the scheduler. + * + * The cid tables are written while the input is still being validated, so a + * rejected call can leave them partially updated. Topo info is only cleared + * when every entry has been accepted. + */ +__bpf_kfunc void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz, + const struct bpf_prog_aux *aux) +{ + cpumask_var_t seen __free(free_cpumask_var) = CPUMASK_VAR_NULL; + struct scx_sched *sch; + bool alloced; + s32 cpu, cid; + + /* GFP_KERNEL alloc must happen before the rcu read section */ + alloced = zalloc_cpumask_var(&seen, GFP_KERNEL); /* a failure is reported further down, once @sch is known */ + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return; + + if (!alloced) { + scx_error(sch, "scx_bpf_cid_override: failed to allocate cpumask"); + return; + } + + if (scx_parent(sch)) { + scx_error(sch, "scx_bpf_cid_override() only allowed from root sched"); + return; + } + + if (cpu_to_cid__sz != nr_cpu_ids * sizeof(s32)) { + scx_error(sch, "scx_bpf_cid_override: expected %zu bytes, got %u", + nr_cpu_ids * sizeof(s32), cpu_to_cid__sz); + return; + } + + for_each_possible_cpu(cpu) { + s32 c = cpu_to_cid[cpu]; /* @seen below is used as a bitmap indexed by cid, not by cpu */ + + if (!cid_valid(sch, c)) + return; /* cid_valid() has already raised scx_error() */ + if (cpumask_test_and_set_cpu(c, seen)) { + scx_error(sch, "cid %d assigned to multiple cpus", c); + return; + } + scx_cpu_to_cid_tbl[cpu] = c; + scx_cid_to_cpu_tbl[c] = cpu; + } + + /* Invalidate stale topo 
info - the override carries no topology. */ + for (cid = 0; cid < num_possible_cpus(); cid++) + scx_cid_topo[cid] = SCX_CID_TOPO_NEG; +} + +/** + * scx_bpf_cid_to_cpu - Return the raw CPU id for @cid + * @cid: cid to look up + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Return the raw CPU id for @cid. Trigger scx_error() and return -EINVAL if + * @cid is invalid. The cid<->cpu mapping is static for the lifetime of the + * loaded scheduler, so the BPF side can cache the result to avoid repeated + * kfunc invocations. + * + * Also return -EINVAL, without scx_error(), when scx_prog_sched() returns + * NULL for @aux. + */ +__bpf_kfunc s32 scx_bpf_cid_to_cpu(s32 cid, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return -EINVAL; + return scx_cid_to_cpu(sch, cid); +} + +/** + * scx_bpf_cpu_to_cid - Return the cid for @cpu + * @cpu: cpu to look up + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Return the cid for @cpu. Trigger scx_error() and return -EINVAL if @cpu is + * invalid. The cid<->cpu mapping is static for the lifetime of the loaded + * scheduler, so the BPF side can cache the result to avoid repeated kfunc + * invocations. + * + * Also return -EINVAL, without scx_error(), when scx_prog_sched() returns + * NULL for @aux. + */ +__bpf_kfunc s32 scx_bpf_cpu_to_cid(s32 cpu, const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return -EINVAL; + return scx_cpu_to_cid(sch, cpu); +} + +/* + * Set ops on cmasks. cmask_walk_op2() shares one walk across mutating + * (and/or/copy/andnot) and predicate (subset/intersects) two-cmask forms; + * cmask_walk_op1() does the same shape over a single cmask range. Every public + * entry passes a compile-time-constant @op; cmask_walk_op{1,2}() and + * cmask_word_op{1,2}() are __always_inline so the inner switch collapses to the + * selected op and cmask_op2_is_pred() folds the predicate early-exit out of + * mutating ops. 
+ * + * Two-cmask ops only touch @dst bits inside the intersection of the two ranges; + * bits outside stay untouched. In particular, scx_cmask_copy() does NOT zero + * @dst bits that lie outside @src's range. + * + * The _RACY variants are otherwise identical to their non-racy counterpart but + * read @src word-by-word via data_race(). Memory ordering with concurrent + * writers is the caller's responsibility. + */ +enum cmask_op2 { + /* mutating */ + CMASK_OP2_AND, + CMASK_OP2_OR, + CMASK_OP2_OR_RACY, + CMASK_OP2_COPY, + CMASK_OP2_COPY_RACY, + CMASK_OP2_ANDNOT, + /* predicates - short-circuit when the per-word result is true; for SUBSET the per-word result is true when the word breaks the subset relation */ + CMASK_OP2_SUBSET, + CMASK_OP2_INTERSECTS, +}; + /* Return true if @op is a predicate, i.e. an op whose per-word result can end the walk early. */ +static __always_inline bool cmask_op2_is_pred(const enum cmask_op2 op) +{ + return op == CMASK_OP2_SUBSET || op == CMASK_OP2_INTERSECTS; +} + /* Apply @op to one word pair: @av is the destination or first operand, @bp the second operand, @mask selects the bits in play. Mutating ops always return false; predicates return the per-word test result. */ +static __always_inline bool cmask_word_op2(u64 *av, const u64 *bp, u64 mask, + const enum cmask_op2 op) +{ + switch (op) { + case CMASK_OP2_AND: + *av &= ~mask | *bp; /* bits outside @mask are forced to 1 in the operand so they survive the AND */ + return false; + case CMASK_OP2_OR: + *av |= *bp & mask; + return false; + case CMASK_OP2_OR_RACY: + *av |= data_race(*bp) & mask; + return false; + case CMASK_OP2_COPY: + *av = (*av & ~mask) | (*bp & mask); /* keep @av outside @mask, take @bp inside it */ + return false; + case CMASK_OP2_COPY_RACY: + *av = (*av & ~mask) | (data_race(*bp) & mask); + return false; + case CMASK_OP2_ANDNOT: + *av &= ~(*bp & mask); + return false; + case CMASK_OP2_SUBSET: + /* scx_cmask_subset() passes @super as @av and @sub as @bp; stop on the first bit in @sub not set in @super */ + return (*bp & ~*av) & mask; + case CMASK_OP2_INTERSECTS: + return (*av & *bp) & mask; + } + unreachable(); +} + +/* + * Walk the intersection of [@a_base, @a_base + @a_nr_cids) with [@b_base, + * @b_base + @b_nr_cids) word by word, applying @op. Mutating ops walk all words + * and return false; predicates return true on the first word whose per-word + * test is true. Empty intersection returns false (matches "no bits to consider" + * for both mutate and predicate). 
+ * + * Base/nr_cids are taken as parameters so callers with snapshotted bounds can + * drive the walk with values independent of the cmask's header. + */ +static __always_inline bool cmask_walk_op2(u64 *a_bits, u32 a_base, u32 a_nr_cids, + const u64 *b_bits, u32 b_base, u32 b_nr_cids, + const enum cmask_op2 op) +{ + u32 lo = max(a_base, b_base); + u32 hi = min(a_base + a_nr_cids, b_base + b_nr_cids); /* [lo, hi) is the intersection of the two cid ranges */ + u32 a_word_off = a_base / 64; /* absolute word index that a_bits[0] corresponds to */ + u32 b_word_off = b_base / 64; /* absolute word index that b_bits[0] corresponds to */ + u32 lo_word = lo / 64; + u32 hi_word = (hi - 1) / 64; /* computed before the emptiness check; only used once lo < hi is known */ + u64 head_mask = GENMASK_U64(63, lo & 63); /* bits at or above lo within the first word */ + u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0); /* bits up to and including hi - 1 within the last word */ + u32 w; + + if (lo >= hi) + return false; + + if (lo_word == hi_word) /* the whole intersection sits in a single word */ + return cmask_word_op2(&a_bits[lo_word - a_word_off], + &b_bits[lo_word - b_word_off], + head_mask & tail_mask, op); + + if (cmask_word_op2(&a_bits[lo_word - a_word_off], + &b_bits[lo_word - b_word_off], head_mask, op) && + cmask_op2_is_pred(op)) + return true; + + for (w = lo_word + 1; w < hi_word; w++) /* interior words are covered in full */ + if (cmask_word_op2(&a_bits[w - a_word_off], + &b_bits[w - b_word_off], ~0ULL, op) && + cmask_op2_is_pred(op)) + return true; + + return cmask_word_op2(&a_bits[hi_word - a_word_off], + &b_bits[hi_word - b_word_off], tail_mask, op); +} + +enum cmask_op1 { + CMASK_OP1_ANY_SET, /* true if any bit under the mask is set */ +}; + /* Apply single-cmask @op to the word at @ap, restricted to the bits in @mask. */ +static __always_inline bool cmask_word_op1(const u64 *ap, u64 mask, + const enum cmask_op1 op) +{ + switch (op) { + case CMASK_OP1_ANY_SET: + return *ap & mask; + } + unreachable(); +} + +/* + * Walk [@a_base, @a_base + @a_nr_cids) of @a_bits word by word, applying @op. + * Returns true on the first word whose per-word test is true; returns false if + * no word matches or the range is empty. All current op1s short-circuit on + * per-word true; if a non-predicate op1 lands here, add a cmask_op1_is_pred() + * guard analogous to cmask_op2_is_pred(). 
+ */ +static __always_inline bool cmask_walk_op1(const u64 *a_bits, u32 a_base, + u32 a_nr_cids, + const enum cmask_op1 op) +{ + u32 lo = a_base; + u32 hi = a_base + a_nr_cids; + u32 a_word_off = a_base / 64; /* equals lo_word, so the first access lands on a_bits[0] */ + u32 lo_word = lo / 64; + u32 hi_word = (hi - 1) / 64; /* only meaningful once lo < hi has been checked below */ + u64 head_mask = GENMASK_U64(63, lo & 63); + u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0); + u32 w; + + if (lo >= hi) + return false; + + if (lo_word == hi_word) /* range fits in one word */ + return cmask_word_op1(&a_bits[lo_word - a_word_off], + head_mask & tail_mask, op); + + if (cmask_word_op1(&a_bits[lo_word - a_word_off], head_mask, op)) + return true; + for (w = lo_word + 1; w < hi_word; w++) + if (cmask_word_op1(&a_bits[w - a_word_off], ~0ULL, op)) + return true; + return cmask_word_op1(&a_bits[hi_word - a_word_off], tail_mask, op); +} + /* AND @src into @dst over the cids both ranges cover; @dst bits outside that overlap are kept. */ +void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_AND); +} + /* OR @src into @dst over the cids both ranges cover. */ +void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_OR); +} + +/** + * scx_cmask_or_racy - OR @src into @dst, reading @src without locking + * + * @src is read word-by-word through data_race(). Same per-bit independence + * rationale as scx_cmask_copy_racy(). Memory ordering with writers is the + * caller's responsibility. + */ +void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_OR_RACY); +} + /* Copy @src bits into @dst over the cids both ranges cover; @dst bits outside that overlap are not zeroed. */ +void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_COPY); +} + +/** + * scx_cmask_copy_racy - Snapshot @src into @dst without locking + * + * @src is read word-by-word through data_race(). Head/tail masking matches + * scx_cmask_copy(). 
Each bit in a cmask is independent, so partial updates + * just leave some bits fresher than others. Memory ordering with writers is + * the caller's responsibility. + */ +void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_COPY_RACY); +} + /* Clear from @dst every bit that is set in @src, over the cids both ranges cover. */ +void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src) +{ + cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, + src->bits, src->base, src->nr_cids, CMASK_OP2_ANDNOT); +} + +/* + * Return true if @cm has any bit set in [@lo, @hi). Caller must ensure + * [@lo, @hi) is contained in @cm's range. An empty range returns false. + */ +static bool cmask_any_set_in_range(const struct scx_cmask *cm, u32 lo, u32 hi) +{ + if (lo >= hi) + return false; + return cmask_walk_op1(&cm->bits[lo / 64 - cm->base / 64], lo, hi - lo, + CMASK_OP1_ANY_SET); /* the walker is handed the word holding @lo as its bits[0], with @lo as the base */ +} + +/** + * scx_cmask_subset - test whether @sub is a subset of @super + * @sub: cmask to test + * @super: cmask to test against + * + * Return true iff every set bit of @sub is also set in @super. + */ +bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super) +{ + u32 super_end = super->base + super->nr_cids; + u32 sub_end = sub->base + sub->nr_cids; + + /* + * Set bits in @sub outside @super's range can't be in @super, so any + * such bit means not a subset. The walk below only visits words + * common to both ranges, so these need a separate scan. 
+ */ + if (sub->base < super->base && + cmask_any_set_in_range(sub, sub->base, min(super->base, sub_end))) + return false; /* @sub has a set bit below the base of @super */ + if (sub_end > super_end && + cmask_any_set_in_range(sub, max(sub->base, super_end), sub_end)) + return false; /* @sub has a set bit at or past the end of @super */ + + return !cmask_walk_op2((u64 *)super->bits, super->base, super->nr_cids, + sub->bits, sub->base, sub->nr_cids, CMASK_OP2_SUBSET); /* the cast only drops const for the shared walker; the SUBSET case does not write through it */ +} + /* Return true iff @a and @b have at least one set bit in common within the overlap of their ranges. */ +bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b) +{ + return cmask_walk_op2((u64 *)a->bits, a->base, a->nr_cids, + b->bits, b->base, b->nr_cids, CMASK_OP2_INTERSECTS); /* as above, the INTERSECTS case only reads through the casted pointer */ +} + +/** + * scx_cmask_empty - Test whether @m has no bits set + * @m: cmask to test + * + * Return true iff @m's active range has no bits set. A cmask with an empty + * active range is reported as empty. + */ +bool scx_cmask_empty(const struct scx_cmask *m) +{ + return !cmask_any_set_in_range(m, m->base, m->base + m->nr_cids); +} + +/** + * scx_bpf_cid_topo - Copy out per-cid topology info + * @cid: cid to look up + * @out__uninit: where to copy the topology info; fully written by this call + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Fill @out__uninit with the topology info for @cid. Trigger scx_error() if + * @cid is out of range. If @cid is valid but in the no-topo section, all fields + * are set to -1. 
+ */ +__bpf_kfunc void scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out__uninit, + const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch) || !cid_valid(sch, cid)) { + *out__uninit = SCX_CID_TOPO_NEG; /* the output is still fully written on the failure path */ + return; + } + + *out__uninit = READ_ONCE(scx_cid_topo)[cid]; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_init) +BTF_ID_FLAGS(func, scx_bpf_cid_override, KF_IMPLICIT_ARGS | KF_SLEEPABLE) +BTF_KFUNCS_END(scx_kfunc_ids_init) /* scx_bpf_cid_override() is the only kfunc in this set and the only one here flagged KF_SLEEPABLE */ + +static const struct btf_kfunc_id_set scx_kfunc_set_init = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_init, + .filter = scx_kfunc_context_filter, +}; + +BTF_KFUNCS_START(scx_kfunc_ids_cid) +BTF_ID_FLAGS(func, scx_bpf_cid_to_cpu, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpu_to_cid, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cid_topo, KF_IMPLICIT_ARGS) +BTF_KFUNCS_END(scx_kfunc_ids_cid) + +static const struct btf_kfunc_id_set scx_kfunc_set_cid = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_cid, +}; /* unlike scx_kfunc_set_init, this set installs no .filter */ + /* Register the kfunc id sets above: the init set for STRUCT_OPS only, the cid set for STRUCT_OPS, TRACING and SYSCALL programs. The ?: chain stops at, and returns, the first non-zero register_btf_kfunc_id_set() result; 0 if all succeed. */ +int scx_cid_kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_init) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_cid) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_cid) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_cid); +} diff --git a/kernel/sched/ext_cid.h b/kernel/sched/ext_cid.h new file mode 100644 index 000000000000..5745e5785e89 --- /dev/null +++ b/kernel/sched/ext_cid.h @@ -0,0 +1,271 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Topological CPU IDs (cids) + * -------------------------- + * + * Raw cpu numbers are clumsy for sharding work and communication across + * topology units, especially from BPF: the space can be sparse, numerical + * closeness doesn't imply topological closeness (x86 hyperthreading often puts + * SMT siblings far apart), and a range of cpu ids doesn't mean anything. 
+ * Sub-scheds make this acute - cpu allocation, revocation and other state are + * constantly communicated across sub-scheds, and passing whole cpumasks scales + * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length + * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences + * for every op. + * + * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or + * NUMA node get contiguous cid ranges, so a topology unit becomes a (start, + * length) slice of cid space. Communication can pass a slice instead of a + * cpumask, and BPF code can process, for example, a u64 word's worth of cids at + * a time. + * + * The mapping is built once at root scheduler enable time by walking the + * topology of online cpus only. Going by online cpus is out of necessity: + * depending on the arch, topology info isn't reliably available for offline + * cpus. The expected usage model is restarting the scheduler on hotplug events + * so the mapping is rebuilt against the new online set. A scheduler that wants + * to handle hotplug without a restart can provide its own cid and shard mapping + * through the override interface. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#ifndef _KERNEL_SCHED_EXT_CID_H +#define _KERNEL_SCHED_EXT_CID_H + +struct scx_sched; + +/* + * Cid space (total is always num_possible_cpus()) is laid out with + * topology-annotated cids first, then no-topo cids at the tail. The + * topology-annotated block covers the cpus that were online when scx_cid_init() + * ran and remains valid even after those cpus go offline. The tail block covers + * possible-but-not-online cpus and carries all-(-1) topo info (see + * scx_cid_topo); callers detect it via the -1 sentinels. + * + * See the comment above the table definitions in ext_cid.c for the + * memory-ordering and visibility contract. 
+ */ +extern s16 *scx_cid_to_cpu_tbl; /* indexed by cid, holds the cpu */ +extern s16 *scx_cpu_to_cid_tbl; /* indexed by cpu, holds the cid; scx_cid_init() seeds every slot with -1 before assigning */ +extern struct scx_cid_topo *scx_cid_topo; /* indexed by cid */ +extern struct btf_id_set8 scx_kfunc_ids_init; + +void scx_cmask_clear(struct scx_cmask *m); +void scx_cmask_fill(struct scx_cmask *m); +void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src); +void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src); +bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super); +bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b); +bool scx_cmask_empty(const struct scx_cmask *m); +s32 scx_cid_init(struct scx_sched *sch); +int scx_cid_kfunc_init(void); +void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst); + +/** + * cid_valid - Verify a cid value, to be used on ops input args + * @sch: scx_sched to abort on error + * @cid: cid which came from a BPF ops + * + * Return true if @cid is in [0, num_possible_cpus()). On failure, trigger + * scx_error() and return false. + */ +static inline bool cid_valid(struct scx_sched *sch, s32 cid) +{ + if (likely(cid >= 0 && cid < num_possible_cpus())) /* the sign test comes first, so a negative cid never reaches the upper-bound comparison */ + return true; + scx_error(sch, "invalid cid %d", cid); + return false; +} + +/** + * __scx_cid_to_cpu - Unchecked cid->cpu table lookup + * @cid: cid to look up. Must be in [0, num_possible_cpus()). + * + * Intended for callsites that have already validated @cid and that hold a + * non-NULL @sch from scx_prog_sched() - a live sched implies the table has + * been allocated, so no NULL check is needed here. 
+ */ +static inline s32 __scx_cid_to_cpu(s32 cid) +{ + /* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc(); it wraps the table pointer, the element load that follows is a plain read */ + return READ_ONCE(scx_cid_to_cpu_tbl)[cid]; +} + +/** + * __scx_cpu_to_cid - Unchecked cpu->cid table lookup + * @cpu: cpu to look up. Must be a valid possible cpu id. + * + * Same usage constraints as __scx_cid_to_cpu(). The result is negative for a + * cpu whose table slot still holds the -1 sentinel. + */ +static inline s32 __scx_cpu_to_cid(s32 cpu) +{ + return READ_ONCE(scx_cpu_to_cid_tbl)[cpu]; +} + +/** + * scx_cid_to_cpu - Translate @cid to its cpu + * @sch: scx_sched for error reporting + * @cid: cid to look up + * + * Return the cpu for @cid or a negative errno on failure. Invalid cid triggers + * scx_error() on @sch. The cid arrays are allocated on first scheduler enable + * and never freed, so the returned cpu is stable for the lifetime of the loaded + * scheduler. + */ +static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid) +{ + if (!cid_valid(sch, cid)) + return -EINVAL; + return __scx_cid_to_cpu(cid); +} + +/** + * scx_cpu_to_cid - Translate @cpu to its cid + * @sch: scx_sched for error reporting + * @cpu: cpu to look up + * + * Return the cid for @cpu or a negative errno on failure. Invalid cpu triggers + * scx_error() on @sch. Same lifetime guarantee as scx_cid_to_cpu(). + */ +static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu) +{ + if (!scx_cpu_valid(sch, cpu, NULL)) /* NOTE(review): scx_cpu_valid() is defined outside this view; the kernel-doc above relies on it raising scx_error() - confirm */ + return -EINVAL; + return __scx_cpu_to_cid(cpu); +} + +/** + * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form + */ +static inline bool scx_is_cid_type(void) +{ + return static_branch_unlikely(&__scx_is_cid_type); +} + /* Return true if @cid lies in the active range of @m, i.e. [base, base + nr_cids). */ +static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m) +{ + return likely(cid >= m->base && cid < m->base + m->nr_cids); +} + +/* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). 
*/ +static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m) +{ + return (u64 *)&m->bits[cid / 64 - m->base / 64]; /* bits[0] is the word containing base; the cast drops const so callers may write through the result */ +} + +/** + * __scx_cmask_init - Initialize @m with explicit storage capacity + * @m: cmask to initialize + * @base: first cid of the active range + * @nr_cids: number of cids in the active range + * @alloc_cids: storage capacity in cids, at least @nr_cids + * + * Use when storage is sized larger than the initial active range. All of + * bits[] is zeroed. If @alloc_cids is smaller than @nr_cids, warn once and + * clamp @nr_cids to @alloc_cids. + */ +static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids, + u32 alloc_cids) +{ + if (WARN_ON_ONCE(alloc_cids < nr_cids)) + nr_cids = alloc_cids; /* clamp rather than describe a range larger than the storage */ + + m->base = base; + m->nr_cids = nr_cids; + m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids); + memset(m->bits, 0, m->alloc_words * sizeof(u64)); +} + +/** + * scx_cmask_init - Initialize @m on tight storage + * @m: cmask to initialize + * @base: first cid of the active range + * @nr_cids: number of cids in the active range + * + * All of bits[] is zeroed. Equivalent to __scx_cmask_init() with the storage + * capacity set to @nr_cids. + */ +static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids) +{ + __scx_cmask_init(m, base, nr_cids, nr_cids); +} + +/** + * scx_cmask_reframe - Reshape @m's active range without resizing storage + * @m: cmask to reframe + * @base: new active range base + * @nr_cids: new active range length, must fit within @m->alloc_words + * + * Body bits within the new range become garbage - only the head and tail + * words are zeroed to keep the padding invariant. 
+ */ +static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids) +{ + if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words)) + return; /* @m is left unchanged when the new range would not fit the storage */ + + if (nr_cids) { + u32 last_word = ((base & 63) + nr_cids - 1) / 64; /* index into bits[] of the word holding the last cid of the new range */ + + m->bits[0] = 0; + m->bits[last_word] = 0; + } + + m->base = base; + m->nr_cids = nr_cids; +} + /* Set the bit for @cid in @m with a plain, non-atomic read-modify-write; a @cid outside the active range is ignored. */ +static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m) +{ + if (!__scx_cmask_contains(cid, m)) + return; + *__scx_cmask_word(cid, m) |= BIT_U64(cid & 63); +} + +/** + * scx_cmask_test - test whether @cid is set in @m + * @cid: cid to test + * @m: cmask to test + * + * Return %false if @cid is outside @m's active range. Otherwise return the + * bit's value. Read via READ_ONCE so callers can race set/clear writers. + */ +static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m) +{ + if (!__scx_cmask_contains(cid, m)) + return false; + return READ_ONCE(*__scx_cmask_word(cid, m)) & BIT_U64(cid & 63); +} + +/* + * Words of bits[] the active range spans, 0 if empty. Tighter than the storage + * SCX_CMASK_NR_WORDS() sizes for the worst-case base alignment. + */ +static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m) +{ + if (!m->nr_cids) + return 0; + return ((m->base & 63) + m->nr_cids - 1) / 64 + 1; /* offset of the last cid from the start of word 0, in words, plus one */ +} + +/** + * scx_cmask_for_each_cid - iterate set cids in @m + * @cid: s32 loop var that receives each set cid in turn + * @m: cmask to iterate + * + * Visits set bits within @m's active range in ascending order. Scans only the + * words the active range spans, where head and tail padding is kept zero, so + * no per-cid range check is needed. + * + * @m is evaluated more than once. Each word is snapshotted with READ_ONCE() + * before its bits are walked, and the loop declares __bs, __wi, __nw and __w + * in its own scope. 
+ */ +#define scx_cmask_for_each_cid(cid, m) \ + for (u64 __bs = (m)->base & ~63u, __wi = 0, \ + __nw = scx_cmask_nr_used_words(m); \ + __wi < __nw; __wi++) \ + for (u64 __w = READ_ONCE((m)->bits[__wi]); \ + __w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true); \ + __w &= __w - 1) + +#endif /* _KERNEL_SCHED_EXT_CID_H */ diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7468560a6d80..2077373d8da3 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -9,7 +9,6 @@ * Copyright (c) 2022 David Vernet <dvernet@meta.com> * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> */ -#include "ext_idle.h" /* Enable/disable built-in idle CPU selection policy */ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); @@ -79,7 +78,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu) int node = scx_cpu_node_if_enabled(cpu); struct cpumask *idle_cpus = idle_cpumask(node)->cpu; -#ifdef CONFIG_SCHED_SMT /* * SMT mask should be cleared whether we can claim @cpu or not. The SMT * cluster is not wholly idle either way. This also prevents @@ -104,7 +102,6 @@ static bool scx_idle_test_and_clear_cpu(int cpu) else if (cpumask_test_cpu(cpu, idle_smts)) __cpumask_clear_cpu(cpu, idle_smts); } -#endif return cpumask_test_and_clear_cpu(cpu, idle_cpus); } @@ -466,12 +463,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, preempt_disable(); /* - * Check whether @prev_cpu is still within the allowed set. If not, - * we can still try selecting a nearby CPU. - */ - is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); - - /* * Determine the subset of CPUs usable by @p within @cpus_allowed. */ if (allowed != p->cpus_ptr) { @@ -488,6 +479,12 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, } /* + * Check whether @prev_cpu is still within the allowed set. If not, + * we can still try selecting a nearby CPU. 
+ */ + is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); + + /* * This is necessary to protect llc_cpus. */ rcu_read_lock(); @@ -622,7 +619,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } -#ifdef CONFIG_SCHED_SMT /* * Use @prev_cpu's sibling if it's idle. */ @@ -634,7 +630,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } } -#endif /* * Search for any idle CPU in the same LLC domain. @@ -714,7 +709,6 @@ static void update_builtin_idle(int cpu, bool idle) assign_cpu(cpu, idle_cpus, idle); -#ifdef CONFIG_SCHED_SMT if (sched_smt_active()) { const struct cpumask *smt = cpu_smt_mask(cpu); struct cpumask *idle_smts = idle_cpumask(node)->smt; @@ -731,7 +725,6 @@ static void update_builtin_idle(int cpu, bool idle) cpumask_andnot(idle_smts, idle_smts, smt); } } -#endif } /* @@ -789,7 +782,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) */ if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_bypassing(sch, cpu_of(rq))) - SCX_CALL_OP(sch, update_idle, rq, cpu_of(rq), idle); + SCX_CALL_OP(sch, update_idle, rq, scx_cpu_arg(cpu_of(rq)), idle); } static void reset_idle_masks(struct sched_ext_ops *ops) @@ -917,7 +910,7 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, bool we_locked = false; s32 cpu; - if (!ops_cpu_valid(sch, prev_cpu, NULL)) + if (!scx_cpu_valid(sch, prev_cpu, NULL)) return -EINVAL; if (!check_builtin_idle_enabled(sch)) @@ -990,7 +983,7 @@ __bpf_kfunc s32 scx_bpf_cpu_node(s32 cpu, const struct bpf_prog_aux *aux) guard(rcu)(); sch = scx_prog_sched(aux); - if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) + if (unlikely(!sch) || !scx_cpu_valid(sch, cpu, NULL)) return NUMA_NO_NODE; return cpu_to_node(cpu); } @@ -1272,7 +1265,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu, const struct bpf_prog_ if (!check_builtin_idle_enabled(sch)) return false; - if (!ops_cpu_valid(sch, cpu, NULL)) + if 
(!scx_cpu_valid(sch, cpu, NULL)) return false; return scx_idle_test_and_clear_cpu(cpu); @@ -1510,13 +1503,9 @@ static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { int scx_idle_init(void) { - int ret; - - ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu); - - return ret; + return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ?: + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu); } diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index a075732d4430..b04701190b23 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -8,35 +8,6 @@ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) #define SCX_MOFF_IDX(moff) ((moff) / sizeof(void (*)(void))) -enum scx_consts { - SCX_DSP_DFL_MAX_BATCH = 32, - SCX_DSP_MAX_LOOPS = 32, - SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, - - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, - SCX_EXIT_DUMP_DFL_LEN = 32768, - - SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, - - /* - * Iterating all tasks may take a while. Periodically drop - * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 
- */ - SCX_TASK_ITER_BATCH = 32, - - SCX_BYPASS_HOST_NTH = 2, - - SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, - SCX_BYPASS_LB_DONOR_PCT = 125, - SCX_BYPASS_LB_MIN_DELTA_DIV = 4, - SCX_BYPASS_LB_BATCH = 256, - - SCX_REENQ_LOCAL_MAX_REPEAT = 256, - - SCX_SUB_MAX_DEPTH = 4, -}; - enum scx_exit_kind { SCX_EXIT_NONE, SCX_EXIT_DONE, @@ -94,6 +65,12 @@ struct scx_exit_info { /* %SCX_EXIT_* - broad category of the exit reason */ enum scx_exit_kind kind; + /* + * CPU that initiated the exit, valid once @kind has been set. + * Negative if the exit path didn't identify a CPU. + */ + s32 exit_cpu; + /* exit code if gracefully exiting */ s64 exit_code; @@ -138,7 +115,8 @@ enum scx_ops_flags { * To mask this problem, by default, unhashed tasks are automatically * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't * depend on pid lookups and wants to handle these tasks directly, the - * following flag can be used. + * following flag can be used. With %SCX_OPS_TID_TO_TASK, + * scx_bpf_tid_to_task() can find exiting tasks reliably. */ SCX_OPS_ENQ_EXITING = 1LLU << 2, @@ -189,6 +167,17 @@ enum scx_ops_flags { */ SCX_OPS_ALWAYS_ENQ_IMMED = 1LLU << 7, + /* + * Maintain a mapping from p->scx.tid to task_struct so the BPF + * scheduler can recover task pointers from stored tids via + * scx_bpf_tid_to_task(). + * + * Only the root scheduler turns this on. A sub-sched may set the flag + * to declare a dependency on the lookup; if the root scheduler hasn't + * enabled it, attaching the sub-sched is rejected. 
+ */ + SCX_OPS_TID_TO_TASK = 1LLU << 8, + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | @@ -196,7 +185,8 @@ enum scx_ops_flags { SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_ALWAYS_ENQ_IMMED, + SCX_OPS_ALWAYS_ENQ_IMMED | + SCX_OPS_TID_TO_TASK, /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, @@ -540,28 +530,6 @@ struct sched_ext_ops { void (*update_idle)(s32 cpu, bool idle); /** - * @cpu_acquire: A CPU is becoming available to the BPF scheduler - * @cpu: The CPU being acquired by the BPF scheduler. - * @args: Acquire arguments, see the struct definition. - * - * A CPU that was previously released from the BPF scheduler is now once - * again under its control. - */ - void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); - - /** - * @cpu_release: A CPU is taken away from the BPF scheduler - * @cpu: The CPU being released by the BPF scheduler. - * @args: Release arguments, see the struct definition. - * - * The specified CPU is no longer under the control of the BPF - * scheduler. This could be because it was preempted by a higher - * priority sched_class, though there may be other reasons as well. The - * caller should consult @args->reason to determine the cause. - */ - void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); - - /** * @init_task: Initialize a task to run in a BPF scheduler * @p: task to initialize for BPF scheduling * @args: init arguments, see the struct definition @@ -851,6 +819,128 @@ struct sched_ext_ops { /* internal use only, must be NULL */ void __rcu *priv; + + /* + * Deprecated callbacks. Kept at the end of the struct so the cid-form + * struct (sched_ext_ops_cid) can omit them without affecting the + * shared field offsets. Use SCX_ENQ_IMMED instead. 
Sitting past + * SCX_OPI_END means has_op doesn't cover them, so SCX_HAS_OP() cannot + * be used; callers must test sch->ops.cpu_acquire / cpu_release + * directly. + */ + + /** + * @cpu_acquire: A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. Deprecated; use SCX_ENQ_IMMED instead. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * @cpu_release: A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler. This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + * Deprecated; use SCX_ENQ_IMMED instead. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); +}; + +/** + * struct sched_ext_ops_cid - cid-form alternative to struct sched_ext_ops + * + * Mirrors struct sched_ext_ops with cpu/cpumask substituted with cid/cmask + * where applicable. Layout up to and including @priv matches sched_ext_ops + * byte-for-byte (verified by BUILD_BUG_ON checks at scx_init() time) so + * shared field offsets work for both struct types in bpf_scx_init_member() + * and bpf_scx_check_member(). The deprecated cpu_acquire/cpu_release + * callbacks at the tail of sched_ext_ops are omitted here entirely. 
+ * + * Differences from sched_ext_ops: + * - select_cpu -> select_cid (returns cid) + * - dispatch -> dispatch (cpu arg is now cid) + * - update_idle -> update_idle (cpu arg is now cid) + * - set_cpumask -> set_cmask (cmask instead of cpumask) + * - cpu_online -> cid_online + * - cpu_offline -> cid_offline + * - dump_cpu -> dump_cid + * - cpu_acquire/cpu_release -> not present (deprecated in sched_ext_ops) + * + * BPF schedulers using this type cannot call cpu-form scx_bpf_* kfuncs; + * use the cid-form variants instead. Enforced at BPF verifier time via + * scx_kfunc_context_filter() branching on prog->aux->st_ops. + * + * See sched_ext_ops for callback documentation. + */ +struct sched_ext_ops_cid { + s32 (*select_cid)(struct task_struct *p, s32 prev_cid, u64 wake_flags); + void (*enqueue)(struct task_struct *p, u64 enq_flags); + void (*dequeue)(struct task_struct *p, u64 deq_flags); + void (*dispatch)(s32 cid, struct task_struct *prev); + void (*tick)(struct task_struct *p); + void (*runnable)(struct task_struct *p, u64 enq_flags); + void (*running)(struct task_struct *p); + void (*stopping)(struct task_struct *p, bool runnable); + void (*quiescent)(struct task_struct *p, u64 deq_flags); + bool (*yield)(struct task_struct *from, struct task_struct *to); + bool (*core_sched_before)(struct task_struct *a, + struct task_struct *b); + void (*set_weight)(struct task_struct *p, u32 weight); + void (*set_cmask)(struct task_struct *p, + const struct scx_cmask *cmask); + void (*update_idle)(s32 cid, bool idle); + s32 (*init_task)(struct task_struct *p, + struct scx_init_task_args *args); + void (*exit_task)(struct task_struct *p, + struct scx_exit_task_args *args); + void (*enable)(struct task_struct *p); + void (*disable)(struct task_struct *p); + void (*dump)(struct scx_dump_ctx *ctx); + void (*dump_cid)(struct scx_dump_ctx *ctx, s32 cid, bool idle); + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); +#ifdef CONFIG_EXT_GROUP_SCHED + s32 
(*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + void (*cgroup_exit)(struct cgroup *cgrp); + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us); + void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle); +#endif /* CONFIG_EXT_GROUP_SCHED */ + s32 (*sub_attach)(struct scx_sub_attach_args *args); + void (*sub_detach)(struct scx_sub_detach_args *args); + void (*cid_online)(s32 cid); + void (*cid_offline)(s32 cid); + s32 (*init)(void); + void (*exit)(struct scx_exit_info *info); + + /* Data fields - must match sched_ext_ops layout exactly */ + u32 dispatch_max_batch; + u64 flags; + u32 timeout_ms; + u32 exit_dump_len; + u64 hotplug_seq; + u64 sub_cgroup_id; + char name[SCX_OPS_NAME_LEN]; + + /* internal use only, must be NULL */ + void __rcu *priv; + + /* layout end anchor for the BUILD_BUG_ON in scx_init(); keep last */ + char __end[0]; }; enum scx_opi { @@ -1009,7 +1099,40 @@ struct scx_sched_pnode { }; struct scx_sched { - struct sched_ext_ops ops; + /* + * cpu-form and cid-form ops share field offsets up to .priv (verified + * by BUILD_BUG_ON in scx_init()). The anonymous union lets the kernel + * access either view of the same storage without function-pointer + * casts: use .ops for cpu-form and shared fields, .ops_cid for the + * cid-renamed callbacks (set_cmask, select_cid, cid_online, ...). + */ + union { + struct sched_ext_ops ops; + struct sched_ext_ops_cid ops_cid; + }; + bool is_cid_type; /* true if registered via bpf_sched_ext_ops_cid */ + + /* + * Arena map auto-discovered from member progs at struct_ops attach. 
+ * cid-form schedulers must use exactly one arena across all member + * progs. NULL on cpu-form. + * + * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added + * at the kernel-side mapping address. @arena_kern_base is the start + * of the arena's kern_vm range. See scx_arena_to_kaddr() and + * scx_kaddr_to_arena(). + */ + struct bpf_map *arena_map; + struct gen_pool *arena_pool; + uintptr_t arena_kern_base; + + /* + * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask + * to ops_cid.set_cmask(). The kernel writes through the stored kern_va + * and hands BPF its arena pointer via scx_kaddr_to_arena(). + */ + struct scx_cmask * __percpu *set_cmask_scratch; + DECLARE_BITMAP(has_op, SCX_OPI_END); /* @@ -1083,6 +1206,31 @@ struct scx_sched { struct scx_sched *ancestors[]; }; +/** + * scx_arena_to_kaddr - Translate a BPF-arena pointer to its kernel address + * @sch: scheduler whose arena hosts @bpf_ptr + * @bpf_ptr: BPF-arena pointer, only the low 32 bits are used + * + * The (u32) cast normalizes any input into the arena's 4 GiB kern_vm range, + * which combined with scratch-page fault recovery makes the returned pointer + * safe to dereference up to GUARD_SZ / 2 past the intended object. Accesses + * larger than GUARD_SZ / 2 must be explicitly bounds-checked. 
+ */ +static inline void *scx_arena_to_kaddr(struct scx_sched *sch, const void *bpf_ptr) +{ + return (void *)(sch->arena_kern_base + (u32)(uintptr_t)bpf_ptr); +} + +/** + * scx_kaddr_to_arena - Translate a kernel arena address to its BPF form + * @sch: scheduler whose arena hosts @kaddr + * @kaddr: kernel-side arena address, supplied by trusted kernel code + */ +static inline void *scx_kaddr_to_arena(struct scx_sched *sch, const void *kaddr) +{ + return (void *)((uintptr_t)kaddr - sch->arena_kern_base); +} + enum scx_wake_flags { /* expose select WF_* flags as enums */ SCX_WAKE_FORK = WF_FORK, @@ -1366,8 +1514,30 @@ enum scx_ops_state { extern struct scx_sched __rcu *scx_root; DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); +/* + * True when the currently loaded scheduler hierarchy is cid-form. All scheds + * in a hierarchy share one form, so this single key tells callsites which + * view to use without per-sch dereferences. Use scx_is_cid_type() to test. + */ +DECLARE_STATIC_KEY_FALSE(__scx_is_cid_type); + int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id); +bool scx_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where); + +__printf(5, 0) bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, + s64 exit_code, s32 exit_cpu, const char *fmt, + va_list args); +__printf(5, 6) bool __scx_exit(struct scx_sched *sch, enum scx_exit_kind kind, + s64 exit_code, s32 exit_cpu, const char *fmt, ...); + +#define scx_exit(sch, kind, exit_code, fmt, args...) \ + __scx_exit(sch, kind, exit_code, raw_smp_processor_id(), fmt, ##args) +#define scx_error(sch, fmt, args...) \ + scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_verror(sch, fmt, args) \ + scx_vexit((sch), SCX_EXIT_ERROR, 0, raw_smp_processor_id(), fmt, args) + /* * Return the rq currently locked from an scx callback, or NULL if no rq is * locked. 
@@ -1476,7 +1646,7 @@ static inline bool scx_task_on_sched(struct scx_sched *sch, return true; } -static struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux) +static inline struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux) { return rcu_dereference_all(scx_root); } diff --git a/kernel/sched/ext_types.h b/kernel/sched/ext_types.h new file mode 100644 index 000000000000..8b3527e21fca --- /dev/null +++ b/kernel/sched/ext_types.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Early sched_ext type definitions. + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2026 Tejun Heo <tj@kernel.org> + */ +#ifndef _KERNEL_SCHED_EXT_TYPES_H +#define _KERNEL_SCHED_EXT_TYPES_H + +enum scx_consts { + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + /* per-CPU chunk size for p->scx.tid allocation, see scx_alloc_tid() */ + SCX_TID_CHUNK = 1024, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. + */ + SCX_TASK_ITER_BATCH = 32, + + SCX_BYPASS_HOST_NTH = 2, + + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, + SCX_BYPASS_LB_DONOR_PCT = 125, + SCX_BYPASS_LB_MIN_DELTA_DIV = 4, + SCX_BYPASS_LB_BATCH = 256, + + SCX_REENQ_LOCAL_MAX_REPEAT = 256, + + SCX_SUB_MAX_DEPTH = 4, +}; + +/* + * Per-cid topology info. For each topology level (core, LLC, node), records + * the first cid in the unit and its global index. Global indices are + * consecutive integers assigned in cid-walk order, so e.g. core_idx ranges + * over [0, nr_cores_at_init) with no gaps. No-topo cids have all fields set + * to -1. 
+ * + * @core_cid: first cid of this cid's core (smt-sibling group) + * @core_idx: global index of that core, in [0, nr_cores_at_init) + * @llc_cid: first cid of this cid's LLC + * @llc_idx: global index of that LLC, in [0, nr_llcs_at_init) + * @node_cid: first cid of this cid's NUMA node + * @node_idx: global index of that node, in [0, nr_nodes_at_init) + */ +struct scx_cid_topo { + s32 core_cid; + s32 core_idx; + s32 llc_cid; + s32 llc_idx; + s32 node_cid; + s32 node_idx; +}; + +/* + * cmask: variable-length, base-windowed bitmap over cid space + * ----------------------------------------------------------- + * + * A cmask covers the cid range [base, base + nr_cids). bits[] is aligned to the + * global 64-cid grid: bits[0] spans [base & ~63, (base & ~63) + 64), so the + * first (base & 63) bits of bits[0] are head padding and the trailing bits of + * the last active word past base + nr_cids are tail padding. Both stay zero; + * all mutating helpers preserve that. Words past the last active word are not + * read by any helper and have no constraint. + * + * Grid alignment means two cmasks always address bits[] against the same global + * 64-cid windows, so cross-cmask word ops (AND, OR, ...) reduce to + * + * dst->bits[i] OP= src->bits[i - delta] + * + * with no bit-shifting, regardless of how the two bases relate mod 64. + */ +struct scx_cmask { + u32 base; + u32 nr_cids; + u32 alloc_words; + u64 bits[] __counted_by(alloc_words); +}; + +/* + * Number of u64 words of bits[] storage that covers @nr_cids regardless of base + * alignment. The +1 absorbs up to 63 bits of head padding when base is not + * 64-aligned - always allocating one extra word beats branching on base or + * splitting the compute. The u64 cast keeps the +63 from wrapping when @nr_cids + * is near U32_MAX, so callers bounds-checking the result against @alloc_words + * catch the overflow instead of seeing a small value. 
+ */ +#define SCX_CMASK_NR_WORDS(nr_cids) ((u32)(((u64)(nr_cids) + 63) / 64 + 1)) + +/** + * __SCX_CMASK_DEFINE - Define an on-stack cmask with explicit storage capacity + * @NAME: variable name to define + * @BASE: first cid of the active range + * @NR_CIDS: active range length + * @ALLOC_CIDS: storage capacity in cids, at least @NR_CIDS + * + * @NAME aliases zero-initialized storage with the active range set to + * [BASE, BASE + NR_CIDS). Use scx_cmask_reframe() to reshape later, up to + * @ALLOC_CIDS. + */ +#define __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, ALLOC_CIDS) \ + _DEFINE_FLEX(struct scx_cmask, NAME, bits, SCX_CMASK_NR_WORDS(ALLOC_CIDS), \ + = { .base = (BASE), \ + .nr_cids = (NR_CIDS), \ + .alloc_words = SCX_CMASK_NR_WORDS(ALLOC_CIDS) }) + +/** + * SCX_CMASK_DEFINE - Define an on-stack cmask on tight storage + * @NAME: variable name to define + * @BASE: first cid of the active range + * @NR_CIDS: active range length, also storage capacity + * + * @NAME aliases zero-initialized storage with the active range and storage + * both [BASE, BASE + NR_CIDS). + */ +#define SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS) \ + __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, NR_CIDS) + +/** + * SCX_CMASK_DEFINE_SHARD - Define an on-stack cmask sized to one shard + * @NAME: variable name to define + * @BASE: first cid of the active range + * @NR_CIDS: active range length, must be <= SCX_CID_SHARD_MAX_CPUS + * + * Storage is fixed at SCX_CID_SHARD_MAX_CPUS, active range framed by + * (BASE, NR_CIDS). Passing NR_CIDS > SCX_CID_SHARD_MAX_CPUS leaves the + * cmask claiming more bits than storage holds and subsequent cmask + * operations will overrun. 
+ */ +#define SCX_CMASK_DEFINE_SHARD(NAME, BASE, NR_CIDS) \ + __SCX_CMASK_DEFINE(NAME, BASE, NR_CIDS, SCX_CID_SHARD_MAX_CPUS) + +#endif /* _KERNEL_SCHED_EXT_TYPES_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 728965851842..d78467ec6ee1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -334,7 +334,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) * to a tree or when we reach the top of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + tg_cfs_rq(cfs_rq->tg->parent, cpu)->on_list) { /* * If parent is already on the list, we add the child * just before. Thanks to circular linked property of @@ -342,7 +342,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) * of the list that starts by parent. */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + &(tg_cfs_rq(cfs_rq->tg->parent, cpu)->leaf_cfs_rq_list)); /* * The branch is now connected to its tree so we can * reset tmp_alone_branch to the beginning of the @@ -525,7 +525,7 @@ static int se_is_idle(struct sched_entity *se) #endif /* !CONFIG_FAIR_GROUP_SCHED */ static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); +bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -882,11 +882,11 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * * lag_i >= 0 -> V >= v_i * - * \Sum (v_i - v)*w_i - * V = ------------------ + v + * \Sum (v_i - v0)*w_i + * V = ------------------- + v0 * \Sum w_i * - * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) + * lag_i >= 0 -> \Sum (v_i - v0)*w_i >= (v_i - v0)*(\Sum w_i) * * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. 
@@ -894,7 +894,7 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->sum_w_vruntime; + s64 key, avg = cfs_rq->sum_w_vruntime; long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { @@ -904,7 +904,36 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= vruntime_op(vruntime, "-", cfs_rq->zero_vruntime) * load; + key = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); + + /* + * The worst case term for @key includes 'NSEC_TICK * NICE_0_LOAD' + * and @load obviously includes NICE_0_LOAD. NSEC_TICK is around 24 + * bits, while NICE_0_LOAD is 20 on 64bit and 10 otherwise. + * + * This gives that on 64bit the product will be at least 64bit which + * overflows s64, while on 32bit it will only be 44bits and should fit + * comfortably. + */ +#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_SUPPORTS_INT128 + /* This often results in simpler code than __builtin_mul_overflow(). */ + return avg >= (__int128)key * load; +#else + s64 rhs; + /* + * On overflow, the sign of key tells us the correct answer: a large + * positive key means vruntime >> V, so not eligible; a large negative + * key means vruntime << V, so eligible. 
+ */ + if (check_mul_overflow(key, load, &rhs)) + return key <= 0; + + return avg >= rhs; +#endif +#else /* 32bit */ + return avg >= key * load; +#endif } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1321,6 +1350,8 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } +static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec); + static s64 update_se(struct rq *rq, struct sched_entity *se) { u64 now = rq_clock_task(rq); @@ -1343,6 +1374,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) trace_sched_stat_runtime(running, delta_exec); account_group_exec_runtime(running, delta_exec); + account_mm_sched(rq, running, delta_exec); /* cgroup time is always accounted against the donor */ cgroup_account_cputime(donor, delta_exec); @@ -1364,6 +1396,581 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) static void set_next_buddy(struct sched_entity *se); +#ifdef CONFIG_SCHED_CACHE + +/* + * XXX numbers come from a place the sun don't shine -- probably wants to be SD + * tunable or so. 
+ */ +#define EPOCH_PERIOD (HZ / 100) /* 10 ms */ +#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ +__read_mostly unsigned int llc_aggr_tolerance = 1; +__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD; +__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT; +__read_mostly unsigned int llc_imb_pct = 20; +__read_mostly unsigned int llc_overaggr_pct = 50; + +static int llc_id(int cpu) +{ + if (cpu < 0) + return -1; + + return per_cpu(sd_llc_id, cpu); +} + +static inline int get_sched_cache_scale(int mul) +{ + unsigned int tol = READ_ONCE(llc_aggr_tolerance); + + if (!tol) + return 0; + + if (tol >= 100) + return INT_MAX; + + return (1 + (tol - 1) * mul); +} + +static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) +{ +#ifdef CONFIG_NUMA_BALANCING + unsigned long llc, footprint; + struct sched_domain *sd; + int scale; + + guard(rcu)(); + + sd = rcu_dereference_sched_domain(cpu_rq(cpu)->sd); + if (!sd) + return true; + + if (static_branch_likely(&sched_numa_balancing)) { + /* + * TBD: RDT exclusive LLC ways reserved should be + * excluded. + */ + llc = sd->llc_bytes; + footprint = READ_ONCE(mm->sc_stat.footprint); + + /* + * Scale the LLC size by 256*llc_aggr_tolerance + * and compare it to the task's footprint. + * + * Suppose the L3 size is 32MB. If the + * llc_aggr_tolerance is 1: + * When the footprint is larger than 32MB, the + * process is regarded as exceeding the LLC + * capacity. If the llc_aggr_tolerance is 99: + * When the footprint is larger than 784GB, the + * process is regarded as exceeding the LLC + * capacity: + * 784GB = (1 + (99 - 1) * 256) * 32MB + * If the llc_aggr_tolerance is 100: + * ignore the footprint and do the aggregation + * anyway. 
+ */ + scale = get_sched_cache_scale(256); + if (scale == INT_MAX) + return false; + + return ((llc * (u64)scale) < (footprint * PAGE_SIZE)); + } +#endif + return false; +} + +static bool invalid_llc_nr(struct mm_struct *mm, struct task_struct *p, + int cpu) +{ + int scale; + + if (get_nr_threads(p) <= 1) + return true; + + /* + * Scale the number of 'cores' in a LLC by llc_aggr_tolerance + * and compare it to the task's active threads. + */ + scale = get_sched_cache_scale(1); + if (scale == INT_MAX) + return false; + + return !fits_capacity((mm->sc_stat.nr_running_avg * cpu_smt_num_threads), + (scale * per_cpu(sd_llc_size, cpu))); +} + +static void account_llc_enqueue(struct rq *rq, struct task_struct *p) +{ + int pref_llc, pref_llc_queued; + struct sched_domain *sd; + + pref_llc = p->preferred_llc; + if (pref_llc < 0) + return; + + pref_llc_queued = (pref_llc == task_llc(p)); + rq->nr_llc_running++; + rq->nr_pref_llc_running += pref_llc_queued; + + /* + * Record whether p is enqueued on its preferred + * LLC, in order to pair with account_llc_dequeue() + * to maintain a consistent nr_pref_llc_running per + * runqueue. + * This is necessary because a race condition exists: + * after a task is enqueued on a runqueue, task_llc(p) + * may change due to CPU hotplug. Therefore, checking + * task_llc(p) to determine whether the task is being + * dequeued from its preferred LLC is unreliable and + * can cause inconsistent values - checking the + * p->pref_llc_queued in account_llc_dequeue() would + * be reliable. 
+ */ + p->pref_llc_queued = pref_llc_queued; + + sd = rcu_dereference_all(rq->sd); + if (sd && (unsigned int)pref_llc < sd->llc_max) + sd->llc_counts[pref_llc]++; +} + +static void account_llc_dequeue(struct rq *rq, struct task_struct *p) +{ + struct sched_domain *sd; + int pref_llc; + + pref_llc = p->preferred_llc; + if (pref_llc < 0) + return; + + rq->nr_llc_running--; + if (p->pref_llc_queued) { + rq->nr_pref_llc_running--; + /* + * Update the status in case + * other logic might query + * this. + */ + p->pref_llc_queued = 0; + } + + sd = rcu_dereference_all(rq->sd); + if (sd && (unsigned int)pref_llc < sd->llc_max) { + /* + * There is a race condition between dequeue + * and CPU hotplug. After a task has been enqueued + * on CPUx, a CPU hotplug event occurs, and all online + * CPUs (including CPUx) rebuild their sched_domains + * and reset statistics to zero(including sd->llc_counts). + * This can cause temporary undercount and we have to + * check for such underflow in sd->llc_counts. + * + * This undercount is temporary and accurate accounting + * will resume once the rq has a chance to be idle. + */ + if (sd->llc_counts[pref_llc]) + sd->llc_counts[pref_llc]--; + } +} + +void mm_init_sched(struct mm_struct *mm, + struct sched_cache_time __percpu *_pcpu_sched) +{ + unsigned long epoch = 0; + int i; + + for_each_possible_cpu(i) { + struct sched_cache_time *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); + struct rq *rq = cpu_rq(i); + + pcpu_sched->runtime = 0; + /* a slightly stale cpu epoch is acceptable */ + pcpu_sched->epoch = rq->cpu_epoch; + epoch = rq->cpu_epoch; + } + + raw_spin_lock_init(&mm->sc_stat.lock); + mm->sc_stat.epoch = epoch; + mm->sc_stat.cpu = -1; + mm->sc_stat.next_scan = jiffies; + mm->sc_stat.nr_running_avg = 0; + mm->sc_stat.footprint = 0; + /* + * The update to mm->sc_stat should not be reordered + * before initialization to mm's other fields, in case + * the readers may get invalid mm_sched_epoch, etc. 
+ */ + smp_store_release(&mm->sc_stat.pcpu_sched, _pcpu_sched); +} + +/* because why would C be fully specified */ +static __always_inline void __shr_u64(u64 *val, unsigned int n) +{ + if (n >= 64) { + *val = 0; + return; + } + *val >>= n; +} + +static inline void __update_mm_sched(struct rq *rq, + struct sched_cache_time *pcpu_sched) +{ + lockdep_assert_held(&rq->cpu_epoch_lock); + + unsigned int period = max(READ_ONCE(llc_epoch_period), 1U); + unsigned long n, now = jiffies; + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { + n = (delta + period - 1) / period; + rq->cpu_epoch += n; + rq->cpu_epoch_next += n * period; + __shr_u64(&rq->cpu_runtime, n); + } + + n = rq->cpu_epoch - pcpu_sched->epoch; + if (n) { + pcpu_sched->epoch += n; + __shr_u64(&pcpu_sched->runtime, n); + } +} + +static unsigned long fraction_mm_sched(struct rq *rq, + struct sched_cache_time *pcpu_sched) +{ + guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); + + __update_mm_sched(rq, pcpu_sched); + + /* + * Runtime is a geometric series (r=0.5) and as such will sum to twice + * the accumulation period, this means the multiplication here should + * not overflow. + */ + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); +} + +static int get_pref_llc(struct task_struct *p, struct mm_struct *mm) +{ + int mm_sched_llc = -1, mm_sched_cpu; + + if (!mm) + return -1; + + mm_sched_cpu = READ_ONCE(mm->sc_stat.cpu); + if (mm_sched_cpu != -1) { + mm_sched_llc = llc_id(mm_sched_cpu); + +#ifdef CONFIG_NUMA_BALANCING + /* + * Don't assign preferred LLC if it + * conflicts with NUMA balancing. + * This can happen when sched_setnuma() gets + * called, however it is not much of an issue + * because we expect account_mm_sched() to get + * called fairly regularly -- at a higher rate + * than sched_setnuma() at least -- and thus the + * conflict only exists for a short period of time. 
+ */ + if (static_branch_likely(&sched_numa_balancing) && + p->numa_preferred_nid >= 0 && + cpu_to_node(mm_sched_cpu) != p->numa_preferred_nid) + mm_sched_llc = -1; +#endif + } + + return mm_sched_llc; +} + +static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); + +static inline +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) +{ + struct sched_cache_time *pcpu_sched; + struct mm_struct *mm = p->mm; + int mm_sched_llc = -1; + unsigned long epoch; + + if (!sched_cache_enabled()) + return; + + if (p->sched_class != &fair_sched_class) + return; + /* + * init_task, kthreads and user thread created + * by user_mode_thread() don't have mm. + */ + if (!mm || !mm->sc_stat.pcpu_sched) + return; + + pcpu_sched = per_cpu_ptr(mm->sc_stat.pcpu_sched, cpu_of(rq)); + + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { + __update_mm_sched(rq, pcpu_sched); + pcpu_sched->runtime += delta_exec; + rq->cpu_runtime += delta_exec; + epoch = rq->cpu_epoch; + } + + /* + * If this process hasn't hit task_cache_work() for a while invalidate + * its preferred state. 
+ */ + if ((long)(epoch - READ_ONCE(mm->sc_stat.epoch)) > llc_epoch_affinity_timeout || + invalid_llc_nr(mm, p, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { + if (READ_ONCE(mm->sc_stat.cpu) != -1) + WRITE_ONCE(mm->sc_stat.cpu, -1); + } + + mm_sched_llc = get_pref_llc(p, mm); + + /* task not on rq accounted later in account_entity_enqueue() */ + if (task_running_on_cpu(rq->cpu, p) && + READ_ONCE(p->preferred_llc) != mm_sched_llc) { + account_llc_dequeue(rq, p); + WRITE_ONCE(p->preferred_llc, mm_sched_llc); + account_llc_enqueue(rq, p); + } +} + +static void task_tick_cache(struct rq *rq, struct task_struct *p) +{ + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + unsigned long epoch; + + if (!sched_cache_enabled()) + return; + + if (!mm || p->flags & PF_KTHREAD || + !mm->sc_stat.pcpu_sched) + return; + + epoch = rq->cpu_epoch; + /* avoid moving backwards */ + if (time_after_eq(mm->sc_stat.epoch, epoch)) + return; + + guard(raw_spinlock)(&mm->sc_stat.lock); + + if (work->next == work) { + task_work_add(p, work, TWA_RESUME); + WRITE_ONCE(mm->sc_stat.epoch, epoch); + } +} + +static void get_scan_cpumasks(cpumask_var_t cpus, struct task_struct *p) +{ +#ifdef CONFIG_NUMA_BALANCING + int cpu, curr_cpu, nid, pref_nid; + + if (!static_branch_likely(&sched_numa_balancing)) + goto out; + + cpu = READ_ONCE(p->mm->sc_stat.cpu); + if (cpu != -1) + nid = cpu_to_node(cpu); + curr_cpu = task_cpu(p); + + /* + * Scanning in the preferred NUMA node is ideal. However, the NUMA + * preferred node is per-task rather than per-process. It is possible + * for different threads of the process to have distinct preferred + * nodes; consequently, the process-wide preferred LLC may bounce + * between different nodes. As a workaround, maintain the scan + * CPU mask to also cover the process's current preferred LLC and the + * current running node to mitigate the bouncing risk. + * TBD: numa_group should be considered during task aggregation. 
+ */ + pref_nid = p->numa_preferred_nid; + /* honor the task's preferred node */ + if (pref_nid == NUMA_NO_NODE) + goto out; + + cpumask_or(cpus, cpus, cpumask_of_node(pref_nid)); + + /* honor the task's preferred LLC CPU */ + if (cpu != -1 && !cpumask_test_cpu(cpu, cpus) && nid != NUMA_NO_NODE) + cpumask_or(cpus, cpus, cpumask_of_node(nid)); + + /* make sure the task's current running node is included */ + if (!cpumask_test_cpu(curr_cpu, cpus)) + cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu))); + + return; + +out: +#endif + cpumask_copy(cpus, cpu_online_mask); +} + +static inline void update_avg_scale(u64 *avg, u64 sample) +{ + int factor = per_cpu(sd_llc_size, raw_smp_processor_id()); + s64 diff = sample - *avg; + u32 divisor; + + /* + * Scale the divisor based on the number of CPUs contained + * in the LLC. This scaling ensures smaller LLC domains use + * a smaller divisor to achieve more precise sensitivity to + * changes in nr_running, while larger LLC domains are capped + * at a maximum divisor of 8 which is the default smoothing + * factor of EWMA in update_avg(). 
+ */ + divisor = clamp_t(u32, (factor >> 2), 2, 8); + *avg += div64_s64(diff, divisor); +} + +static void task_cache_work(struct callback_head *work) +{ + int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu; + unsigned long next_scan, now = jiffies; + struct task_struct *p = current, *cur; + unsigned long curr_m_a_occ = 0; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); + + work->next = work; + + if (p->flags & PF_EXITING) + return; + + next_scan = READ_ONCE(mm->sc_stat.next_scan); + if (time_before(now, next_scan)) + return; + + /* only 1 thread is allowed to scan */ + if (!try_cmpxchg(&mm->sc_stat.next_scan, &next_scan, + now + max_t(unsigned long, + READ_ONCE(llc_epoch_period), 1))) + return; + + curr_cpu = task_cpu(p); + if (invalid_llc_nr(mm, p, curr_cpu) || + exceed_llc_capacity(mm, curr_cpu)) { + if (READ_ONCE(mm->sc_stat.cpu) != -1) + WRITE_ONCE(mm->sc_stat.cpu, -1); + + return; + } + + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + + scoped_guard (cpus_read_lock) { + guard(rcu)(); + + get_scan_cpumasks(cpus, p); + + for_each_cpu(cpu, cpus) { + /* XXX sched_cluster_active */ + struct sched_domain *sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); + unsigned long occ, m_occ = 0, a_occ = 0; + int m_cpu = -1, i; + + if (!sd) + continue; + + for_each_cpu(i, sched_domain_span(sd)) { + occ = fraction_mm_sched(cpu_rq(i), + per_cpu_ptr(mm->sc_stat.pcpu_sched, i)); + a_occ += occ; + if (occ > m_occ) { + m_occ = occ; + m_cpu = i; + } + + cur = rcu_dereference_all(cpu_rq(i)->curr); + if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && + cur->mm == mm) + nr_running++; + } + + /* + * Compare the accumulated occupancy of each LLC. The + * reason for using accumulated occupancy rather than average + * per CPU occupancy is that it works better in asymmetric LLC + * scenarios. 
+ * For example, if there are 2 threads in a 4CPU LLC and 3 + * threads in an 8CPU LLC, it might be better to choose the one + * with 3 threads. However, this would not be the case if the + * occupancy is divided by the number of CPUs in an LLC (i.e., + * if average per CPU occupancy is used). + * Besides, NUMA balancing fault statistics behave similarly: + * the total number of faults per node is compared rather than + * the average number of faults per CPU. This strategy is also + * followed here. + */ + if (a_occ > m_a_occ) { + m_a_occ = a_occ; + m_a_cpu = m_cpu; + } + + if (llc_id(cpu) == llc_id(READ_ONCE(mm->sc_stat.cpu))) + curr_m_a_occ = a_occ; + + cpumask_andnot(cpus, cpus, sched_domain_span(sd)); + } + } + + if (m_a_occ > (2 * curr_m_a_occ)) { + /* + * Avoid switching sc_stat.cpu too fast. + * The reason to choose 2X is because: + * 1. It is better to keep the preferred LLC stable, + * rather than changing it frequently and cause migrations + * 2. 2X means the new preferred LLC has at least 1 more + * busy CPU than the old one(200% vs 100%, eg) + * 3. 2X is chosen based on test results, as it delivers + * the optimal performance gain so far. + */ + WRITE_ONCE(mm->sc_stat.cpu, m_a_cpu); + } + + update_avg_scale(&mm->sc_stat.nr_running_avg, nr_running); + free_cpumask_var(cpus); +} + +void init_sched_mm(struct task_struct *p) +{ + struct callback_head *work = &p->cache_work; + + init_task_work(work, task_cache_work); + work->next = work; + /* + * Reset new task's preference to avoid + * polluting account_llc_enqueue(). 
+ */ + p->preferred_llc = -1; +} + +#else /* CONFIG_SCHED_CACHE */ + +static inline void account_mm_sched(struct rq *rq, struct task_struct *p, + s64 delta_exec) { } + +void init_sched_mm(struct task_struct *p) { } + +static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + +static inline int get_pref_llc(struct task_struct *p, + struct mm_struct *mm) +{ + return -1; +} + +static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {} + +static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {} + +#endif /* CONFIG_SCHED_CACHE */ + /* * Used by other classes to account runtime. */ @@ -1549,13 +2156,9 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) se->exec_start = rq_clock_task(rq_of(cfs_rq)); } -/************************************************** - * Scheduling class queueing methods: - */ - +/* Check sched_smt_active before calling this to avoid overheads in fastpaths */ static inline bool is_core_idle(int cpu) { -#ifdef CONFIG_SCHED_SMT int sibling; for_each_cpu(sibling, cpu_smt_mask(cpu)) { @@ -1565,7 +2168,6 @@ static inline bool is_core_idle(int cpu) if (!idle_cpu(sibling)) return false; } -#endif return true; } @@ -2248,12 +2850,11 @@ numa_type numa_classify(unsigned int imbalance_pct, return node_fully_busy; } -#ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) { - if (!static_branch_likely(&sched_smt_present) || + if (!sched_smt_active() || idle_core >= 0 || !test_idle_cores(cpu)) return idle_core; @@ -2266,12 +2867,6 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else /* !CONFIG_SCHED_SMT: */ -static inline int numa_idle_core(int idle_core, int cpu) -{ - return idle_core; -} -#endif /* !CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -3050,6 +3645,7 @@ static void 
task_numa_placement(struct task_struct *p) unsigned long total_faults; u64 runtime, period; spinlock_t *group_lock = NULL; + long __maybe_unused new_fp; struct numa_group *ng; /* @@ -3124,6 +3720,31 @@ static void task_numa_placement(struct task_struct *p) ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } +#ifdef CONFIG_SCHED_CACHE + /* + * Per task p->numa_faults[mem_idx] converges, + * so the accumulation of each task's faults + * converges too - Given the number of threads, + * it cannot overflow an unsigned long. + * Racy with concurrent updates from other threads + * sharing this mm. Acceptable since footprint is a + * heuristic and occasional lost updates are tolerable. + * + * If a task exits, its corresponding footprint must + * be subtracted from the mm->sc_stat.footprint, otherwise + * the mm->sc_stat.footprint will not converge: + * the exiting thread's footprint remains unchanged/undecayed + * in mm->sc_stat.footprint. See exit_mm(). + * + * Lost updates and unsynchronized subtraction + * in exit_mm() can cause footprint + diff to + * go negative. Clamp to zero to prevent the + * unsigned footprint from wrapping. 
+ */ + new_fp = (long)READ_ONCE(p->mm->sc_stat.footprint) + diff; + WRITE_ONCE(p->mm->sc_stat.footprint, + max(new_fp, 0L)); +#endif } if (!ng) { @@ -3848,9 +4469,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (entity_is_task(se)) { + struct task_struct *p = task_of(se); struct rq *rq = rq_of(cfs_rq); - account_numa_enqueue(rq, task_of(se)); + account_numa_enqueue(rq, p); + account_llc_enqueue(rq, p); list_add(&se->group_node, &rq->cfs_tasks); } cfs_rq->nr_queued++; @@ -3861,7 +4484,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); if (entity_is_task(se)) { - account_numa_dequeue(rq_of(cfs_rq), task_of(se)); + struct task_struct *p = task_of(se); + struct rq *rq = rq_of(cfs_rq); + + account_numa_dequeue(rq, p); + account_llc_dequeue(rq, p); list_del_init(&se->group_node); } cfs_rq->nr_queued--; @@ -4364,7 +4991,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) * For migration heavy workloads, access to tg->load_avg can be * unbound. Limit the update rate to at most once per ms. 
*/ - now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + now = rq_clock(rq_of(cfs_rq)); if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) return; @@ -4387,7 +5014,7 @@ static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq) if (cfs_rq->tg == &root_task_group) return; - now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + now = rq_clock(rq_of(cfs_rq)); delta = 0 - cfs_rq->tg_load_avg_contrib; atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = 0; @@ -4408,13 +5035,13 @@ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) */ rq_clock_start_loop_update(rq); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); clear_tg_load_avg(cfs_rq); } - rcu_read_unlock(); rq_clock_stop_loop_update(rq); } @@ -4930,13 +5557,86 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s trace_pelt_cfs_tp(cfs_rq); } +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) + +static inline void util_est_update(struct sched_entity *se) +{ + unsigned int ewma, dequeued, last_ewma_diff; + + if (!sched_feat(UTIL_EST)) + return; + + /* Get current estimate of utilization */ + ewma = READ_ONCE(se->avg.util_est); + + /* + * If the PELT values haven't changed since enqueue time, + * skip the util_est update. + */ + if (ewma & UTIL_AVG_UNCHANGED) + return; + + /* Get utilization at dequeue */ + dequeued = READ_ONCE(se->avg.util_avg); + + /* + * Reset EWMA on utilization increases, the moving average is used only + * to smooth utilization decreases. + */ + if (ewma <= dequeued) { + ewma = dequeued; + goto done; + } + + /* + * Skip update of task's estimated utilization when its members are + * already ~1% close to its last activation value. 
+ */ + last_ewma_diff = ewma - dequeued; + if (last_ewma_diff < UTIL_EST_MARGIN) + goto done; + + /* + * To avoid underestimate of task utilization, skip updates of EWMA if + * we cannot grant that thread got all CPU time it wanted. + */ + if ((dequeued + UTIL_EST_MARGIN) < READ_ONCE(se->avg.runnable_avg)) + goto done; + + /* + * Update Task's estimated utilization + * + * When *p completes an activation we can consolidate another sample + * of the task size. This is done by using this value to update the + * Exponential Weighted Moving Average (EWMA): + * + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) + * = w * ( -last_ewma_diff ) + ewma(t-1) + * = w * (-last_ewma_diff + ewma(t-1) / w) + * + * Where 'w' is the weight of new samples, which is configured to be + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) + */ + ewma <<= UTIL_EST_WEIGHT_SHIFT; + ewma -= last_ewma_diff; + ewma >>= UTIL_EST_WEIGHT_SHIFT; +done: + ewma |= UTIL_AVG_UNCHANGED; + WRITE_ONCE(se->avg.util_est, ewma); + + trace_sched_util_est_se_tp(se); +} + /* * Optional action to be done while updating the load average */ -#define UPDATE_TG 0x1 -#define SKIP_AGE_LOAD 0x2 -#define DO_ATTACH 0x4 -#define DO_DETACH 0x8 +#define UPDATE_TG 0x01 +#define SKIP_AGE_LOAD 0x02 +#define DO_ATTACH 0x04 +#define DO_DETACH 0x08 +#define UPDATE_UTIL_EST 0x10 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -4979,6 +5679,9 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s if (flags & UPDATE_TG) update_tg_load_avg(cfs_rq); } + + if (flags & UPDATE_UTIL_EST) + util_est_update(se); } /* @@ -5037,11 +5740,6 @@ static inline unsigned long task_util(struct task_struct *p) return READ_ONCE(p->se.avg.util_avg); } -static inline unsigned long task_runnable(struct task_struct *p) -{ - return 
READ_ONCE(p->se.avg.runnable_avg); -} - static inline unsigned long _task_util_est(struct task_struct *p) { return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; @@ -5084,88 +5782,6 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq, trace_sched_util_est_cfs_tp(cfs_rq); } -#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) - -static inline void util_est_update(struct cfs_rq *cfs_rq, - struct task_struct *p, - bool task_sleep) -{ - unsigned int ewma, dequeued, last_ewma_diff; - - if (!sched_feat(UTIL_EST)) - return; - - /* - * Skip update of task's estimated utilization when the task has not - * yet completed an activation, e.g. being migrated. - */ - if (!task_sleep) - return; - - /* Get current estimate of utilization */ - ewma = READ_ONCE(p->se.avg.util_est); - - /* - * If the PELT values haven't changed since enqueue time, - * skip the util_est update. - */ - if (ewma & UTIL_AVG_UNCHANGED) - return; - - /* Get utilization at dequeue */ - dequeued = task_util(p); - - /* - * Reset EWMA on utilization increases, the moving average is used only - * to smooth utilization decreases. - */ - if (ewma <= dequeued) { - ewma = dequeued; - goto done; - } - - /* - * Skip update of task's estimated utilization when its members are - * already ~1% close to its last activation value. - */ - last_ewma_diff = ewma - dequeued; - if (last_ewma_diff < UTIL_EST_MARGIN) - goto done; - - /* - * To avoid underestimate of task utilization, skip updates of EWMA if - * we cannot grant that thread got all CPU time it wanted. - */ - if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) - goto done; - - - /* - * Update Task's estimated utilization - * - * When *p completes an activation we can consolidate another sample - * of the task size. 
This is done by using this value to update the - * Exponential Weighted Moving Average (EWMA): - * - * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) - * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) - * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) - * = w * ( -last_ewma_diff ) + ewma(t-1) - * = w * (-last_ewma_diff + ewma(t-1) / w) - * - * Where 'w' is the weight of new samples, which is configured to be - * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) - */ - ewma <<= UTIL_EST_WEIGHT_SHIFT; - ewma -= last_ewma_diff; - ewma >>= UTIL_EST_WEIGHT_SHIFT; -done: - ewma |= UTIL_AVG_UNCHANGED; - WRITE_ONCE(p->se.avg.util_est, ewma); - - trace_sched_util_est_se_tp(&p->se); -} - static inline unsigned long get_actual_cpu_capacity(int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); @@ -5618,7 +6234,7 @@ static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { bool sleep = flags & DEQUEUE_SLEEP; - int action = UPDATE_TG; + int action = 0; update_curr(cfs_rq); clear_buddies(cfs_rq, se); @@ -5638,15 +6254,23 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { - update_load_avg(cfs_rq, se, 0); + if (entity_is_task(se)) + action |= UPDATE_UTIL_EST; + update_load_avg(cfs_rq, se, action); update_entity_lag(cfs_rq, se); set_delayed(se); return false; } } - if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) - action |= DO_DETACH; + action = UPDATE_TG; + if (entity_is_task(se)) { + if (task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + + if (sleep && !(flags & DEQUEUE_DELAYED)) + action |= UPDATE_UTIL_EST; + } /* * When dequeuing a sched_entity, we must: @@ -5764,8 +6388,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect) return se; } -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); - static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -5775,9 +6397,6 
@@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_curr(cfs_rq); - /* throttle cfs_rqs exceeding runtime */ - check_cfs_rq_runtime(cfs_rq); - if (prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ @@ -5912,44 +6531,32 @@ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, return cfs_rq->runtime_remaining > 0; } -/* returns 0 on failure to allocate runtime */ -static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - int ret; - - raw_spin_lock(&cfs_b->lock); - ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice()); - raw_spin_unlock(&cfs_b->lock); +static bool throttle_cfs_rq(struct cfs_rq *cfs_rq); - return ret; -} - -static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +static bool __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; if (likely(cfs_rq->runtime_remaining > 0)) - return; + return false; if (cfs_rq->throttled) - return; + return true; /* - * if we're unable to extend our runtime we resched so that the active - * hierarchy can be throttled + * throttle_cfs_rq() will try to extend the runtime first + * before throttling the hierarchy. 
*/ - if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + return throttle_cfs_rq(cfs_rq); } static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) +bool account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) - return; + return false; - __account_cfs_rq_runtime(cfs_rq, delta_exec); + return __account_cfs_rq_runtime(cfs_rq, delta_exec); } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -5970,7 +6577,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) { - return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); + return throttled_hierarchy(tg_cfs_rq(task_group(p), dst_cpu)); } static inline bool task_is_throttled(struct task_struct *p) @@ -6116,8 +6723,18 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); struct task_struct *p, *tmp; + LIST_HEAD(throttled_tasks); + + /* + * If cfs_rq->curr is set, the cfs_rq might not have caught up + * since the last clock update. Do it now before we begin + * queueing task onto it to save the need for unnecessarily + * unthrottle the hierarchy for this cfs_rq to be throttled + * right back again. + */ + update_curr(cfs_rq); if (--cfs_rq->throttle_count) return 0; @@ -6139,13 +6756,31 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_self_time += delta; } + /* + * Move the tasks to a local list since an update_curr() during + * enqueue_task_fair() can throttle a higher cfs_rq, and it can + * see the "throttled_limbo_list" being non-empty in + * tg_throttle_down() if throttle_count turned 0 above. 
+ */ + list_splice_init(&cfs_rq->throttled_limbo_list, &throttled_tasks); + /* Re-enqueue the tasks that have been throttled at this level. */ - list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { + list_for_each_entry_safe(p, tmp, &throttled_tasks, throttle_node) { + /* + * Back to being throttled! Break out and put the remaining + * tasks back onto the limbo_list to prevent running them + * unnecessarily. + */ + if (cfs_rq->throttle_count) + break; + list_del_init(&p->throttle_node); p->throttled = false; - enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); + enqueue_task_fair(rq, p, ENQUEUE_WAKEUP); } + list_splice(&throttled_tasks, &cfs_rq->throttled_limbo_list); + /* Add cfs_rq with load or one or more already running entities to the list */ if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); @@ -6187,7 +6822,7 @@ static void record_throttle_clock(struct cfs_rq *cfs_rq) static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); if (cfs_rq->throttle_count++) return 0; @@ -6209,35 +6844,48 @@ static int tg_throttle_down(struct task_group *tg, void *data) static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { - struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - int dequeue = 1; + struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); + + scoped_guard(raw_spinlock, &cfs_b->lock) { + u64 target_runtime = 1; - raw_spin_lock(&cfs_b->lock); - /* This will start the period timer if necessary */ - if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) { /* - * We have raced with bandwidth becoming available, and if we - * actually throttled the timer might not unthrottle us for an - * entire period. 
We additionally needed to make sure that any - * subsequent check_cfs_rq_runtime calls agree not to throttle - * us, as we may commit to do cfs put_prev+pick_next, so we ask - * for 1ns of runtime rather than just check cfs_b. + * If cfs_rq->curr is still runnable, we are here from an + * update_curr(). Request sysctl_sched_cfs_bandwidth_slice + * worth of bandwidth to continue running. + * + * If the curr is not runnable, just request enough bandwidth + * to be runnable next time the pick selects this cfs_rq. + */ + if (curr && curr->on_rq) + target_runtime = sched_cfs_bandwidth_slice(); + + /* + * Check if We have raced with bandwidth becoming available. If + * we actually throttled the timer might not unthrottle us for + * an entire period. We additionally needed to make sure that + * any subsequent check_cfs_rq_runtime calls agree not to + * throttle us, as we may commit to do cfs put_prev+pick_next, + * so we ask for 1ns of runtime rather than just check cfs_b. + * + * This will start the period timer if necessary. + */ + if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, target_runtime)) + return false; + + /* + * No bandwidth available; Add ourselves on the list to be + * unthrottled later. */ - dequeue = 0; - } else { list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); } - raw_spin_unlock(&cfs_b->lock); - - if (!dequeue) - return false; /* Throttle no longer required. */ /* freeze hierarchy runnable averages while throttled */ - rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); - rcu_read_unlock(); + scoped_guard(rcu) + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); /* * Note: distribution will already see us throttled via the @@ -6245,6 +6893,17 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) */ cfs_rq->throttled = 1; WARN_ON_ONCE(cfs_rq->throttled_clock); + + /* + * If current hierarchy was throttled, add throttle work to the + * current donor. 
In case of proxy-execution, the execution + * context cannot exit to the userspace while holding a mutex + * and the rule of throttle deferral to only throttle the + * throttled context at exit to userspace is still preserved. + */ + if (curr && curr->on_rq) + task_throttle_setup_work(rq->donor); + return true; } @@ -6252,7 +6911,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + struct sched_entity *se = cfs_rq_se(cfs_rq); /* * It's possible we are called with runtime_remaining < 0 due to things @@ -6262,21 +6921,25 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) * We can't unthrottle this cfs_rq without any runtime remaining because * any enqueue in tg_unthrottle_up() will immediately trigger a throttle, * which is not supposed to happen on unthrottle path. + * + * Catch up on the remaining runtime since last clock update before + * checking runtime remaining. 
*/ + update_curr(cfs_rq); if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) return; cfs_rq->throttled = 0; - update_rq_clock(rq); + scoped_guard(raw_spinlock, &cfs_b->lock) { + list_del_rcu(&cfs_rq->throttled_list); + + if (!cfs_rq->throttled_clock) + break; - raw_spin_lock(&cfs_b->lock); - if (cfs_rq->throttled_clock) { cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; cfs_rq->throttled_clock = 0; } - list_del_rcu(&cfs_rq->throttled_list); - raw_spin_unlock(&cfs_b->lock); /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); @@ -6305,9 +6968,8 @@ static void __cfsb_csd_unthrottle(void *arg) { struct cfs_rq *cursor, *tmp; struct rq *rq = arg; - struct rq_flags rf; - rq_lock(rq, &rf); + guard(rq_lock)(rq); /* * Iterating over the list can trigger several call to @@ -6324,7 +6986,7 @@ static void __cfsb_csd_unthrottle(void *arg) * race with group being freed in the window between removing it * from the list and advancing to the next entry in the list. 
*/ - rcu_read_lock(); + guard(rcu)(); list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, throttled_csd_list) { @@ -6334,10 +6996,7 @@ static void __cfsb_csd_unthrottle(void *arg) unthrottle_cfs_rq(cursor); } - rcu_read_unlock(); - rq_clock_stop_loop_update(rq); - rq_unlock(rq, &rf); } static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) @@ -6346,6 +7005,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) bool first; if (rq == this_rq()) { + update_rq_clock(rq); unthrottle_cfs_rq(cfs_rq); return; } @@ -6373,15 +7033,14 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { + bool throttled = false, unthrottle_local = false; int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; - bool throttled = false; - struct cfs_rq *cfs_rq, *tmp; - struct rq_flags rf; + struct cfs_rq *cfs_rq; struct rq *rq; - LIST_HEAD(local_unthrottle); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { rq = rq_of(cfs_rq); @@ -6391,64 +7050,66 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) break; } - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); + if (!cfs_rq_throttled(cfs_rq)) - goto next; + continue; /* Already queued for async unthrottle */ if (!list_empty(&cfs_rq->throttled_csd_list)) - goto next; + continue; + + if (cfs_rq->curr) { + update_rq_clock(rq); + update_curr(cfs_rq); + } /* By the above checks, this should never be true */ WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); - raw_spin_lock(&cfs_b->lock); - runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > cfs_b->runtime) - runtime = cfs_b->runtime; - cfs_b->runtime -= runtime; - remaining = cfs_b->runtime; - raw_spin_unlock(&cfs_b->lock); + scoped_guard(raw_spinlock, &cfs_b->lock) { + runtime = -cfs_rq->runtime_remaining + 1; + if (runtime > cfs_b->runtime) + runtime = cfs_b->runtime; + cfs_b->runtime -= runtime; + 
remaining = cfs_b->runtime; + } cfs_rq->runtime_remaining += runtime; - /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) { - if (cpu_of(rq) != this_cpu) { - unthrottle_cfs_rq_async(cfs_rq); - } else { - /* - * We currently only expect to be unthrottling - * a single cfs_rq locally. - */ - WARN_ON_ONCE(!list_empty(&local_unthrottle)); - list_add_tail(&cfs_rq->throttled_csd_list, - &local_unthrottle); - } - } else { + /* + * Ran out of bandwidth during distribution! + * Indicate throttled entities and break early. + */ + if (cfs_rq->runtime_remaining <= 0) { throttled = true; + break; } -next: - rq_unlock_irqrestore(rq, &rf); - } - - list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, - throttled_csd_list) { - struct rq *rq = rq_of(cfs_rq); - - rq_lock_irqsave(rq, &rf); - - list_del_init(&cfs_rq->throttled_csd_list); - - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); + /* we check whether we're throttled above */ + if (cpu_of(rq) != this_cpu) { + unthrottle_cfs_rq_async(cfs_rq); + continue; + } - rq_unlock_irqrestore(rq, &rf); + /* + * Allow a parallel async unthrottle to unthrottle + * this cfs_rq too via __cfsb_csd_unthrottle(). + * If we are first, do it ourselves at the end and + * save on an IPI from remote CPUs. + */ + unthrottle_local = list_empty(&rq->cfsb_csd_list); + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); } - WARN_ON_ONCE(!list_empty(&local_unthrottle)); - rcu_read_unlock(); + if (unthrottle_local) { + /* + * Protect against an IPI that is also trying to flush + * the unthrottled cfs_rq(s) from this CPU's csd_list. 
+ */ + scoped_guard(irqsave) + __cfsb_csd_unthrottle(cpu_rq(this_cpu)); + } return throttled; } @@ -6572,7 +7233,8 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (slack_runtime <= 0) return; - raw_spin_lock(&cfs_b->lock); + guard(raw_spinlock)(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; @@ -6581,7 +7243,6 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) !list_empty(&cfs_b->throttled_cfs_rq)) start_cfs_slack_bandwidth(cfs_b); } - raw_spin_unlock(&cfs_b->lock); /* even if it's not valid for return we don't want to try again */ cfs_rq->runtime_remaining -= slack_runtime; @@ -6604,25 +7265,21 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { - u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - unsigned long flags; - /* confirm we're still not at a refresh boundary */ - raw_spin_lock_irqsave(&cfs_b->lock, flags); - cfs_b->slack_started = false; + scoped_guard(raw_spinlock_irqsave, &cfs_b->lock) { + u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); - return; - } + cfs_b->slack_started = false; - if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) - runtime = cfs_b->runtime; + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + return; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) + runtime = cfs_b->runtime; - if (!runtime) - return; + if (!runtime) + return; + } distribute_cfs_runtime(cfs_b); } @@ -6637,7 +7294,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* an active group must be handled by the update_curr()->put() path */ + /* an active group must be handled by the update_curr() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ 
-6647,8 +7304,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) /* update runtime allocation */ account_cfs_rq_runtime(cfs_rq, 0); - if (cfs_rq->runtime_remaining <= 0) - throttle_cfs_rq(cfs_rq); } static void sync_throttle(struct task_group *tg, int cpu) @@ -6661,8 +7316,8 @@ static void sync_throttle(struct task_group *tg, int cpu) if (!tg->parent) return; - cfs_rq = tg->cfs_rq[cpu]; - pcfs_rq = tg->parent->cfs_rq[cpu]; + cfs_rq = tg_cfs_rq(tg, cpu); + pcfs_rq = tg_cfs_rq(tg->parent, cpu); cfs_rq->throttle_count = pcfs_rq->throttle_count; cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); @@ -6678,25 +7333,6 @@ static void sync_throttle(struct task_group *tg, int cpu) cfs_rq->pelt_clock_throttled = 1; } -/* conditionally throttle active cfs_rq's from put_prev_entity() */ -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - if (!cfs_bandwidth_used()) - return false; - - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return false; - - /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. 
- */ - if (cfs_rq_throttled(cfs_rq)) - return true; - - return throttle_cfs_rq(cfs_rq); -} - static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -6711,18 +7347,18 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); - unsigned long flags; int overrun; int idle = 0; int count = 0; - raw_spin_lock_irqsave(&cfs_b->lock, flags); + CLASS(raw_spinlock_irqsave, cfsb_guard)(&cfs_b->lock); + for (;;) { overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + idle = do_sched_cfs_period_timer(cfs_b, overrun, cfsb_guard.flags); if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -6755,11 +7391,13 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) count = 0; } } - if (idle) + + if (idle) { cfs_b->period_active = 0; - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + return HRTIMER_NORESTART; + } - return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; + return HRTIMER_RESTART; } void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) @@ -6826,14 +7464,12 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) */ for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - unsigned long flags; if (list_empty(&rq->cfsb_csd_list)) continue; - local_irq_save(flags); - __cfsb_csd_unthrottle(rq); - local_irq_restore(flags); + scoped_guard(irqsave) + __cfsb_csd_unthrottle(rq); } } @@ -6851,16 +7487,15 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq) lockdep_assert_rq_held(rq); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); - raw_spin_lock(&cfs_b->lock); - cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; - raw_spin_unlock(&cfs_b->lock); + scoped_guard(raw_spinlock, &cfs_b->lock) + cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; } - rcu_read_unlock(); } /* cpu offline callback */ @@ -6881,9 +7516,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) */ rq_clock_start_loop_update(rq); - rcu_read_lock(); + guard(rcu)(); + list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); if (!cfs_rq->runtime_enabled) continue; @@ -6904,7 +7540,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) cfs_rq->runtime_remaining = 1; unthrottle_cfs_rq(cfs_rq); } - rcu_read_unlock(); rq_clock_stop_loop_update(rq); } @@ -6951,8 +7586,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) #else /* !CONFIG_CFS_BANDWIDTH: */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } +static bool 
account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -7409,7 +8043,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); - util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); if (dequeue_entities(rq, &p->se, flags) < 0) return false; @@ -7782,7 +8415,6 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p) return -1; } -#ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -7790,7 +8422,7 @@ static inline void set_idle_cores(int cpu, int val) { struct sched_domain_shared *sds; - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu)); if (sds) WRITE_ONCE(sds->has_idle_cores, val); } @@ -7799,7 +8431,7 @@ static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu)); if (sds) return READ_ONCE(sds->has_idle_cores); @@ -7808,7 +8440,7 @@ static inline bool test_idle_cores(int cpu) /* * Scans the local SMT mask to see if the entire core is idle, and records this - * information in sd_llc_shared->has_idle_cores. + * information in sd_balance_shared->has_idle_cores. * * Since SMT siblings share all cache levels, inspecting this limited remote * state should be fairly cheap. @@ -7838,7 +8470,8 @@ unlock: /* * Scan the entire LLC domain for idle cores; this dynamically switches off if * there are no idle cores left in the system; tracked through - * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. 
+ * sd_balance_shared->has_idle_cores and enabled through update_idle_core() + * above. */ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { @@ -7892,29 +8525,6 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* !CONFIG_SCHED_SMT: */ - -static inline void set_idle_cores(int cpu, int val) -{ -} - -static inline bool test_idle_cores(int cpu) -{ - return false; -} - -static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) -{ - return __select_idle_cpu(core, p); -} - -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - return -1; -} - -#endif /* !CONFIG_SCHED_SMT */ - /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by * comparing the average scan cost (tracked in sd->avg_scan_cost) against the @@ -7925,7 +8535,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - if (sched_feat(SIS_UTIL)) { + if (sched_feat(SIS_UTIL) && sd->shared) { /* * Increment because !--nr is the condition to stop scan. * @@ -7990,6 +8600,54 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } /* + * Idle-capacity scan converts util_fits_cpu() outcomes into preference ranks, + * where lower values indicate a better fit - see select_idle_capacity(). + * + * A CPU that both fits the task and sits on a fully-idle SMT core is returned + * immediately and is never assigned one of these ranks. On !SMT every CPU is + * its own "core", so the early return covers all fits-and-idle cases and the + * core-tier ranks below become unreachable. 
+ * + * Rank Val Tier Meaning + * ------------------------------ --- ------ --------------------------- + * ASYM_IDLE_UCLAMP_MISFIT -4 core Idle core; capacity fits + * util but uclamp_min misses. + * ASYM_IDLE_COMPLETE_MISFIT -3 core Idle core; capacity does + * not fit. Still beats every + * thread-tier rank: a busy + * sibling cuts effective + * capacity more than a + * misfit hurts a quiet core. + * ASYM_IDLE_THREAD_FITS -2 thread Busy SMT sibling; capacity + * fits util + uclamp. + * ASYM_IDLE_THREAD_UCLAMP_MISFIT -1 thread Busy SMT sibling; capacity + * fits but uclamp_min misses + * (native util_fits_cpu() + * return value). + * ASYM_IDLE_THREAD_MISFIT 0 thread Busy SMT sibling; capacity + * does not fit. + * + * ASYM_IDLE_CORE_BIAS (-3) is an offset, not a state. On an idle core, + * fits += ASYM_IDLE_CORE_BIAS rebases thread-tier ranks into the core tier: + * + * ASYM_IDLE_THREAD_UCLAMP_MISFIT (-1) + BIAS -> ASYM_IDLE_UCLAMP_MISFIT (-4) + * ASYM_IDLE_THREAD_MISFIT (0) + BIAS -> ASYM_IDLE_COMPLETE_MISFIT (-3) + * + * ASYM_IDLE_THREAD_FITS (-2) is never rebased because a fully-fitting idle-core + * candidate early-returns from select_idle_capacity(). + */ +enum asym_fits_state { + ASYM_IDLE_UCLAMP_MISFIT = -4, + ASYM_IDLE_COMPLETE_MISFIT, + ASYM_IDLE_THREAD_FITS, + ASYM_IDLE_THREAD_UCLAMP_MISFIT, + ASYM_IDLE_THREAD_MISFIT, + + /* util_fits_cpu() bias for idle core */ + ASYM_IDLE_CORE_BIAS = -3, +}; + +/* * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which * the task fits. If no CPU is big enough, but there are idle ones, try to * maximize capacity. @@ -7997,10 +8655,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { + /* + * On !SMT systems, has_idle_core is always false and preferred_core + * is always true (CPU == core), so the SMT preference logic below + * collapses to the plain capacity scan. 
+ */ + bool has_idle_core = sched_smt_active() && test_idle_cores(target); unsigned long task_util, util_min, util_max, best_cap = 0; - int fits, best_fits = 0; + int fits, best_fits = ASYM_IDLE_THREAD_MISFIT; int cpu, best_cpu = -1; struct cpumask *cpus; + int nr = INT_MAX; cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); @@ -8009,16 +8674,41 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_min = uclamp_eff_value(p, UCLAMP_MIN); util_max = uclamp_eff_value(p, UCLAMP_MAX); + if (sched_feat(SIS_UTIL) && sd->shared) { + /* + * Same nr_idle_scan hint as select_idle_cpu(), nr only limits + * the scan when not preferring an idle core. + */ + nr = READ_ONCE(sd->shared->nr_idle_scan) + 1; + /* overloaded domain is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; + } + for_each_cpu_wrap(cpu, cpus, target) { + bool preferred_core = !has_idle_core || is_core_idle(cpu); unsigned long cpu_cap = capacity_of(cpu); + /* + * Stop when the nr_idle_scan is exhausted (mirrors + * select_idle_cpu() logic). + */ + if (!has_idle_core && --nr <= 0) + return best_cpu; + if (!choose_idle_cpu(cpu, p)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); - /* This CPU fits with all requirements */ - if (fits > 0) + /* + * Perfect fit: capacity satisfies util + uclamp and the CPU + * sits on a fully-idle SMT core, this is a !SMT system, or + * there is no idle core to find. + * Short-circuit the rank-based selection and return + * immediately. + */ + if (fits > 0 && preferred_core) return cpu; /* * Only the min performance hint (i.e. uclamp_min) doesn't fit. @@ -8026,9 +8716,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) */ else if (fits < 0) cpu_cap = get_actual_cpu_capacity(cpu); + /* + * fits > 0 implies we are not on a preferred core, but the util + * fits CPU capacity. 
Set fits to ASYM_IDLE_THREAD_FITS + * so the effective range becomes + * [ASYM_IDLE_THREAD_FITS, ASYM_IDLE_THREAD_MISFIT], where: + * ASYM_IDLE_THREAD_MISFIT - does not fit + * ASYM_IDLE_THREAD_UCLAMP_MISFIT - fits with the exception of UCLAMP_MIN + * ASYM_IDLE_THREAD_FITS - fits with the exception of preferred_core + */ + else if (fits > 0) + fits = ASYM_IDLE_THREAD_FITS; /* - * First, select CPU which fits better (-1 being better than 0). + * If we are on a preferred core, translate the range of fits + * of [ASYM_IDLE_THREAD_UCLAMP_MISFIT, ASYM_IDLE_THREAD_MISFIT] to + * [ASYM_IDLE_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT]. + * This ensures that an idle core is always given priority over + * (partially) busy core. + * + * A fully fitting idle core would have returned early and hence + * fits > 0 for preferred_core need not be dealt with. + */ + if (preferred_core) + fits += ASYM_IDLE_CORE_BIAS; + + /* + * First, select CPU which fits better (lower is more preferred). * Then, select the one with best capacity at same level. */ if ((fits < best_fits) || @@ -8039,6 +8753,19 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) } } + /* + * A value in the [ASYM_IDLE_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT] + * range means the chosen CPU is in a fully idle SMT core. Values above + * ASYM_IDLE_COMPLETE_MISFIT mean we never ranked such a CPU best. + * + * The asym-capacity wakeup path returns from select_idle_sibling() + * after this function and never runs select_idle_cpu(), so the usual + * select_idle_cpu() tail that clears idle cores must live here when the + * idle-core preference did not win. 
+ */ + if (has_idle_core && best_fits > ASYM_IDLE_COMPLETE_MISFIT) + set_idle_cores(target, false); + return best_cpu; } @@ -8047,12 +8774,22 @@ static inline bool asym_fits_cpu(unsigned long util, unsigned long util_max, int cpu) { - if (sched_asym_cpucap_active()) + if (sched_asym_cpucap_active()) { /* * Return true only if the cpu fully fits the task requirements * which include the utilization and the performance hints. + * + * When SMT is active, also require that the core has no busy + * siblings. + * + * Note: gating on is_core_idle() also makes the early-bailout + * candidates in select_idle_sibling() (target, prev, + * recent_used_cpu) idle-core-aware on ASYM+SMT, which the + * NO_ASYM path does not do. */ - return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + return (!sched_smt_active() || is_core_idle(cpu)) && + (util_fits_cpu(util, util_min, util_max, cpu) > 0); + } return true; } @@ -8231,25 +8968,32 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { + bool add_task = p && task_cpu(p) != cpu && dst_cpu == cpu; + bool sub_task = p && task_cpu(p) == cpu && dst_cpu != cpu; struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); unsigned long runnable; - if (boost) { - runnable = READ_ONCE(cfs_rq->avg.runnable_avg); - util = max(util, runnable); - } - /* * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its * contribution. If @p migrates from another CPU to @cpu add its * contribution. In all the other cases @cpu is not impacted by the * migration so its util_avg is already correct. 
*/ - if (p && task_cpu(p) == cpu && dst_cpu != cpu) - lsub_positive(&util, task_util(p)); - else if (p && task_cpu(p) != cpu && dst_cpu == cpu) + if (add_task) util += task_util(p); + else if (sub_task) + lsub_positive(&util, task_util(p)); + + if (boost) { + runnable = READ_ONCE(cfs_rq->avg.runnable_avg); + if (add_task) + runnable += READ_ONCE(p->se.avg.runnable_avg); + else if (sub_task) + lsub_positive(&runnable, + READ_ONCE(p->se.avg.runnable_avg)); + util = max(util, runnable); + } if (sched_feat(UTIL_EST)) { unsigned long util_est; @@ -9145,9 +9889,10 @@ pick: /* * Because p is enqueued, nse being null can only mean that we - * dequeued a delayed task. + * dequeued a delayed task. If there are still entities queued in + * cfs, check if the next one will be p. */ - if (!nse) + if (!nse && cfs_rq->nr_queued) goto pick; if (sched_feat(RUN_TO_PARITY)) @@ -9164,17 +9909,19 @@ preempt: resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) +struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) + __must_hold(__rq_lockp(rq)) { struct sched_entity *se; struct cfs_rq *cfs_rq; struct task_struct *p; bool throttled; + int new_tasks; again: cfs_rq = &rq->cfs; if (!cfs_rq->nr_queued) - return NULL; + goto idle; throttled = false; @@ -9183,8 +9930,6 @@ again: if (cfs_rq->curr && cfs_rq->curr->on_rq) update_curr(cfs_rq); - throttled |= check_cfs_rq_runtime(cfs_rq); - se = pick_next_entity(rq, cfs_rq, true); if (!se) goto again; @@ -9195,95 +9940,22 @@ again: if (unlikely(throttled)) task_throttle_setup_work(p); return p; -} - -static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); - -struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - __must_hold(__rq_lockp(rq)) -{ - struct sched_entity *se; - struct task_struct *p; - int new_tasks; - -again: - p 
= pick_task_fair(rq, rf); - if (!p) - goto idle; - se = &p->se; - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != &fair_sched_class) - goto simple; - - __put_prev_set_next_dl_server(rq, prev, p); - - /* - * Because of the set_next_buddy() in dequeue_task_fair() it is rather - * likely that a next task is from the same cgroup as the current. - * - * Therefore attempt to avoid putting and setting the entire cgroup - * hierarchy, only change the part that actually changes. - * - * Since we haven't yet done put_prev_entity and if the selected task - * is a different task than we started out with, try and touch the - * least amount of cfs_rqs. - */ - if (prev != p) { - struct sched_entity *pse = &prev->se; - struct cfs_rq *cfs_rq; - - while (!(cfs_rq = is_same_group(se, pse))) { - int se_depth = se->depth; - int pse_depth = pse->depth; - - if (se_depth <= pse_depth) { - put_prev_entity(cfs_rq_of(pse), pse); - pse = parent_entity(pse); - } - if (se_depth >= pse_depth) { - set_next_entity(cfs_rq_of(se), se, true); - se = parent_entity(se); - } - } - - put_prev_entity(cfs_rq, pse); - set_next_entity(cfs_rq, se, true); - - __set_next_task_fair(rq, p, true); - } - - return p; - -simple: -#endif /* CONFIG_FAIR_GROUP_SCHED */ - put_prev_set_next_task(rq, prev, p); - return p; idle: - if (rf) { - new_tasks = sched_balance_newidle(rq, rf); - - /* - * Because sched_balance_newidle() releases (and re-acquires) - * rq->lock, it is possible for any higher priority task to - * appear. In that case we must re-start the pick_next_entity() - * loop. 
- */ - if (new_tasks < 0) - return RETRY_TASK; - - if (new_tasks > 0) - goto again; - } + if (sched_core_enabled(rq)) + return NULL; + new_tasks = sched_balance_newidle(rq, rf); + if (new_tasks < 0) + return RETRY_TASK; + if (new_tasks > 0) + goto again; return NULL; } static struct task_struct * fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) + __must_hold(__rq_lockp(dl_se->rq)) { return pick_task_fair(dl_se->rq, rf); } @@ -9304,10 +9976,33 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; + struct sched_entity *nse = NULL; - for_each_sched_entity(se) { +#ifdef CONFIG_FAIR_GROUP_SCHED + if (next && next->sched_class == &fair_sched_class) + nse = &next->se; +#endif + + while (se) { cfs_rq = cfs_rq_of(se); - put_prev_entity(cfs_rq, se); + if (!nse || cfs_rq->curr) + put_prev_entity(cfs_rq, se); +#ifdef CONFIG_FAIR_GROUP_SCHED + if (nse) { + if (is_same_group(se, nse)) + break; + + int d = nse->depth - se->depth; + if (d >= 0) { + /* nse has equal or greater depth, ascend */ + nse = parent_entity(nse); + /* if nse is the deeper, do not ascend se */ + if (d > 0) + continue; + } + } +#endif + se = parent_entity(se); } } @@ -9529,6 +10224,16 @@ enum group_type { */ group_imbalanced, /* + * There are tasks running on non-preferred LLC, possible to move + * them to their preferred LLC without creating too much imbalance. + * The priority of group_llc_balance is lower than that of + * group_overloaded and higher than that of all other group types. + * This is because group_llc_balance may exacerbate load imbalance. + * If the LLC balancing attempt fails, the nr_balance_failed + * mechanism will trigger other group types to rebalance the load. + */ + group_llc_balance, + /* * The CPU is overloaded and can't provide expected CPU cycles to all * tasks. 
*/ @@ -9539,7 +10244,8 @@ enum migration_type { migrate_load = 0, migrate_util, migrate_task, - migrate_misfit + migrate_misfit, + migrate_llc_task }; #define LBF_ALL_PINNED 0x01 @@ -9547,6 +10253,7 @@ enum migration_type { #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 #define LBF_ACTIVE_LB 0x10 +#define LBF_LLC_PINNED 0x20 struct lb_env { struct sched_domain *sd; @@ -9556,6 +10263,7 @@ struct lb_env { int dst_cpu; struct rq *dst_rq; + bool dst_core_idle; struct cpumask *dst_grpmask; int new_dst_cpu; @@ -9692,7 +10400,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ struct cfs_rq *dst_cfs_rq; #ifdef CONFIG_FAIR_GROUP_SCHED - dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; + dst_cfs_rq = tg_cfs_rq(task_group(p), dest_cpu); #else dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; #endif @@ -9703,6 +10411,298 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ return 0; } +#ifdef CONFIG_SCHED_CACHE +/* + * The margin used when comparing LLC utilization with CPU capacity. + * It determines the LLC load level where active LLC aggregation is + * done. + * Derived from fits_capacity(). + * + * (default: ~50%, tunable via debugfs) + */ +static bool fits_llc_capacity(unsigned long util, unsigned long max) +{ + u32 aggr_pct = llc_overaggr_pct; + + /* + * For single core systems, raise the aggregation + * threshold to accommodate more tasks. + */ + if (cpu_smt_num_threads == 1) + aggr_pct = (aggr_pct * 3 / 2); + + return util * 100 < max * aggr_pct; +} + +/* + * The margin used when comparing utilization. + * is 'util1' noticeably greater than 'util2' + * Derived from capacity_greater(). + * Bias is in percentage. 
+ */ +/* Allows dst util to be bigger than src util by up to bias percent */ +#define util_greater(util1, util2) \ + ((util1) * 100 > (util2) * (100 + llc_imb_pct)) + +static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +{ + struct sched_domain_shared *sd_share; + + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + if (!sd_share) + return false; + + *util = READ_ONCE(sd_share->util_avg); + *cap = READ_ONCE(sd_share->capacity); + + return true; +} + +/* + * Decision matrix according to the LLC utilization. To + * decide whether we can do task aggregation across LLC. + * + * By default, 50% is the threshold for treating the LLC + * as busy. The reason for choosing 50% is to avoid saturation + * of SMT-2, and it is also a safe cutoff for other SMT-n + * platforms. SMT-1 has higher threshold because it is + * supposed to accommodate more tasks, see fits_llc_capacity(). + * + * 20% is the utilization imbalance percentage to decide + * if the preferred LLC is busier than the non-preferred LLC. + * 20 is a little higher than the LLC domain's imbalance_pct + * 17. The hysteresis is used to avoid task bouncing between the + * preferred LLC and the non-preferred LLC, and it will + * be turned into tunable debugfs. + * + * 1. moving towards the preferred LLC, dst is the preferred + * LLC, src is not. + * + * src \ dst 30% 40% 50% 60% + * 30% Y Y Y N + * 40% Y Y Y Y + * 50% Y Y G G + * 60% Y Y G G + * + * 2. moving out of the preferred LLC, src is the preferred + * LLC, dst is not: + * + * src \ dst 30% 40% 50% 60% + * 30% N N N N + * 40% N N N N + * 50% N N G G + * 60% Y N G G + * + * src : src_util + * dst : dst_util + * Y : Yes, migrate + * N : No, do not migrate + * G : let the Generic load balance to even the load. + * + * The intention is that if both LLCs are quite busy, cache aware + * load balance should not be performed, and generic load balance + * should take effect. 
However, if one is busy and the other is not, + * the preferred LLC capacity(50%) and imbalance criteria(20%) should + * be considered to determine whether LLC aggregation should be + * performed to bias the load towards the preferred LLC. + */ + +/* migration decision, 3 states are orthogonal. */ +enum llc_mig { + mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */ + mig_llc, /* Y: Do LLC preference based migration */ + mig_unrestricted /* G: Don't restrict generic load balance migration */ +}; + +/* + * Check if task can be moved from the source LLC to the + * destination LLC without breaking cache aware preference. + * src_cpu and dst_cpu are arbitrary CPUs within the source + * and destination LLCs, respectively. + */ +static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, + unsigned long tsk_util, + bool to_pref) +{ + unsigned long src_util, dst_util, src_cap, dst_cap; + + if (!get_llc_stats(src_cpu, &src_util, &src_cap) || + !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) + return mig_unrestricted; + + src_util = src_util < tsk_util ? 0 : src_util - tsk_util; + dst_util = dst_util + tsk_util; + + if (!fits_llc_capacity(dst_util, dst_cap) && + !fits_llc_capacity(src_util, src_cap)) + return mig_unrestricted; + + if (to_pref) { + /* + * Don't migrate if we will get preferred LLC too + * heavily loaded and if the dest is much busier + * than the src, in which case migration will + * increase the imbalance too much. + */ + if (!fits_llc_capacity(dst_util, dst_cap) && + util_greater(dst_util, src_util)) + return mig_forbid; + } else { + /* + * Don't migrate if we will leave preferred LLC + * too idle, or if this migration leaves the + * non-preferred LLC within llc_imb_pct percent + * of the preferred LLC, leading to migration again + * back to the preferred LLC. 
+ */ + if (fits_llc_capacity(src_util, src_cap) || + !util_greater(src_util, dst_util)) + return mig_forbid; + } + return mig_llc; +} + +/* + * Check if task p can migrate from source LLC to + * destination LLC in terms of cache aware load balance. + */ +static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + struct task_struct *p) +{ + struct mm_struct *mm; + bool to_pref; + int cpu; + + mm = p->mm; + if (!mm) + return mig_unrestricted; + + cpu = READ_ONCE(mm->sc_stat.cpu); + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + + /* skip cache aware load balance for too many threads */ + if (invalid_llc_nr(mm, p, dst_cpu) || + exceed_llc_capacity(mm, dst_cpu)) { + if (READ_ONCE(mm->sc_stat.cpu) != -1) + WRITE_ONCE(mm->sc_stat.cpu, -1); + return mig_unrestricted; + } + + if (cpus_share_cache(dst_cpu, cpu)) + to_pref = true; + else if (cpus_share_cache(src_cpu, cpu)) + to_pref = false; + else + return mig_unrestricted; + + return can_migrate_llc(src_cpu, dst_cpu, + task_util(p), to_pref); +} + +/* + * Check if active load balance breaks LLC locality in + * terms of cache aware load balance. The load level and + * imbalance do not warrant breaking LLC preference per + * the can_migrate_llc() policy. Here, the benefit of + * LLC locality outweighs the power efficiency gained from + * migrating the only runnable task away. + */ +static inline bool +alb_break_llc(struct lb_env *env) +{ + if (!sched_cache_enabled()) + return false; + + if (cpus_share_cache(env->src_cpu, env->dst_cpu)) + return false; + /* + * All tasks prefer to stay on their current CPU. + * Do not pull a task from its preferred CPU if: + * 1. It is the only task running and does not exceed + * imbalance allowance; OR + * 2. Migrating it away from its preferred LLC would violate + * the cache-aware scheduling policy. 
+ */ + if (env->src_rq->nr_pref_llc_running && + env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) { + unsigned long util = 0; + struct task_struct *cur; + + if (env->src_rq->nr_running <= 1) + return true; + + cur = rcu_dereference_all(env->src_rq->curr); + if (cur && cur->sched_class == &fair_sched_class) + util = task_util(cur); + + if (can_migrate_llc(env->src_cpu, env->dst_cpu, + util, false) == mig_forbid) + return true; + } + + return false; +} + +/* + * Check if migrating task p from env->src_cpu to + * env->dst_cpu breaks LLC locality. + */ +static bool migrate_degrades_llc(struct task_struct *p, struct lb_env *env) +{ + if (!sched_cache_enabled()) + return false; + + if (task_has_sched_core(p)) + return false; + /* + * Skip over tasks that would degrade LLC locality; + * only when nr_balance_failed is sufficiently high do we + * ignore this constraint. + * + * Threshold of cache_nice_tries is set to 1 higher + * than nr_balance_failed to avoid excessive task + * migration at the same time. + */ + if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1) + return false; + + /* + * We know the env->src_cpu has some tasks that prefer to + * run on env->dst_cpu's LLC; skip the tasks that do not + * prefer it, and find one that does. + */ + if (env->migration_type == migrate_llc_task && + READ_ONCE(p->preferred_llc) != llc_id(env->dst_cpu)) + return true; + + if (can_migrate_llc_task(env->src_cpu, + env->dst_cpu, p) != mig_forbid) + return false; + + return true; +} + +#else +static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +{ + return false; +} + +static inline bool +alb_break_llc(struct lb_env *env) +{ + return false; +} + +static inline bool +migrate_degrades_llc(struct task_struct *p, struct lb_env *env) +{ + return false; +} +#endif /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
*/ @@ -9799,10 +10799,29 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 1; degrades = migrate_degrades_locality(p, env); - if (!degrades) + if (!degrades) { + /* + * If the NUMA locality is not broken, + * further check if migration would hurt + * LLC locality. + */ + if (migrate_degrades_llc(p, env)) { + /* + * If regular load balancing fails to pull a task + * due to LLC locality, this is expected behavior + * and we set LBF_LLC_PINNED so we don't increase + * nr_balance_failed unnecessarily. + */ + if (env->migration_type != migrate_llc_task) + env->flags |= LBF_LLC_PINNED; + + return 0; + } + hot = task_hot(p, env); - else + } else { hot = degrades > 0; + } if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (hot) @@ -9964,6 +10983,10 @@ static int detach_tasks(struct lb_env *env) env->imbalance = 0; break; + + case migrate_llc_task: + env->imbalance--; + break; } detach_task(p, env); @@ -10097,7 +11120,6 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq, *pos; bool decayed = false; - int cpu = cpu_of(rq); /* * Iterates the task_group tree in a bottom up fashion, see @@ -10117,7 +11139,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) } /* Propagate pending load changes to the parent, if any: */ - se = cfs_rq->tg->se[cpu]; + se = cfs_rq_se(cfs_rq); if (se && !skip_blocked_update(se)) update_load_avg(cfs_rq_of(se), se, UPDATE_TG); @@ -10143,8 +11165,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) */ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) { - struct rq *rq = rq_of(cfs_rq); - struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; + struct sched_entity *se = cfs_rq_se(cfs_rq); unsigned long now = jiffies; unsigned long load; @@ -10242,12 +11263,16 @@ struct sg_lb_stats { enum group_type group_type; unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned int group_smt_balance; /* Task on busy SMT be 
moved */ + unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ unsigned int group_overutilized; /* At least one CPU is overutilized in the group */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; #endif +#ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_dst_llc; +#endif }; /* @@ -10505,6 +11530,9 @@ group_type group_classify(unsigned int imbalance_pct, if (group_is_overloaded(imbalance_pct, sgs)) return group_overloaded; + if (sgs->group_llc_balance) + return group_llc_balance; + if (sg_imbalanced(group)) return group_imbalanced; @@ -10659,6 +11687,105 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) return check_cpu_capacity(rq, sd); } +#ifdef CONFIG_SCHED_CACHE +/* + * Record the statistics for this scheduler group for later + * use. These values guide load balancing on aggregating tasks + * to a LLC. + */ +static void record_sg_llc_stats(struct lb_env *env, + struct sg_lb_stats *sgs, + struct sched_group *group) +{ + struct sched_domain_shared *sd_share; + int cpu; + + if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE) + return; + + /* Only care about sched domain spanning multiple LLCs */ + if (env->sd->child != rcu_dereference_all(per_cpu(sd_llc, env->dst_cpu))) + return; + + /* + * At this point we know this group spans a LLC domain. + * Record the statistic of this group in its corresponding + * shared LLC domain. + * Note: sd_share cannot be obtained via sd->child->shared, + * because the latter refers to the domain that covers the + * local group. Instead, sd_share should be located using + * the first CPU of the LLC group. 
+ */ + cpu = cpumask_first(sched_group_span(group)); + sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + if (!sd_share) + return; + + if (READ_ONCE(sd_share->util_avg) != sgs->group_util) + WRITE_ONCE(sd_share->util_avg, sgs->group_util); + + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); +} + +/* + * Do LLC balance on sched group that contains LLC, and have tasks preferring + * to run on LLC in idle dst_cpu. + */ +static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + if (!sched_cache_enabled()) + return false; + + if (env->sd->flags & SD_SHARE_LLC) + return false; + + /* + * Skip cache aware tagging if nr_balanced_failed is sufficiently high. + * Threshold of cache_nice_tries is set to 1 higher than nr_balance_failed + * to avoid excessive task migration at the same time. + */ + if (env->sd->nr_balance_failed >= env->sd->cache_nice_tries + 1) + return false; + + if (sgs->nr_pref_dst_llc && + can_migrate_llc(cpumask_first(sched_group_span(group)), + env->dst_cpu, 0, true) == mig_llc) + return true; + + return false; +} + +static bool update_llc_busiest(struct lb_env *env, + struct sg_lb_stats *busiest, + struct sg_lb_stats *sgs) +{ + /* + * There are more tasks that want to run on dst_cpu's LLC. + */ + return sgs->nr_pref_dst_llc > busiest->nr_pref_dst_llc; +} +#else +static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ +} + +static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + return false; +} + +static bool update_llc_busiest(struct lb_env *env, + struct sg_lb_stats *busiest, + struct sg_lb_stats *sgs) +{ + return false; +} +#endif + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
@@ -10695,6 +11822,20 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (cpu_overutilized(i)) sgs->group_overutilized = 1; +#ifdef CONFIG_SCHED_CACHE + if (sched_cache_enabled()) { + struct sched_domain *sd_tmp; + int dst_llc; + + dst_llc = llc_id(env->dst_cpu); + if (llc_id(i) != dst_llc) { + sd_tmp = rcu_dereference_all(rq->sd); + if (sd_tmp && (unsigned int)dst_llc < sd_tmp->llc_max) + sgs->nr_pref_dst_llc += sd_tmp->llc_counts[dst_llc]; + } + } +#endif + /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -10735,17 +11876,24 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_weight = group->group_weight; - /* Check if dst CPU is idle and preferred to this group */ - if (!local_group && env->idle && sgs->sum_h_nr_running && - sched_group_asym(env, sgs, group)) - sgs->group_asym_packing = 1; + if (!local_group) { + /* Check if dst CPU is idle and preferred to this group */ + if (env->idle && sgs->sum_h_nr_running && + sched_group_asym(env, sgs, group)) + sgs->group_asym_packing = 1; + + /* Check for loaded SMT group to be balanced to dst CPU */ + if (smt_balance(env, sgs, group)) + sgs->group_smt_balance = 1; - /* Check for loaded SMT group to be balanced to dst CPU */ - if (!local_group && smt_balance(env, sgs, group)) - sgs->group_smt_balance = 1; + /* Check for tasks in this group can be moved to their preferred LLC */ + if (llc_balance(env, sgs, group)) + sgs->group_llc_balance = 1; + } sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + record_sg_llc_stats(env, sgs, group); /* Computing avg_load makes sense only when group is overloaded */ if (sgs->group_type == group_overloaded) sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / @@ -10781,10 +11929,16 @@ static bool update_sd_pick_busiest(struct lb_env *env, * We can use max_capacity here as reduction in capacity on some * CPUs in the group should either be possible to resolve * internally or be covered by avg_load imbalance 
(eventually). + * + * When SMT is active, only pull a misfit to dst_cpu if it is on a + * fully idle core; otherwise the effective capacity of the core is + * reduced and we may not actually provide more capacity than the + * source. */ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && (sgs->group_type == group_misfit_task) && - (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || + (!env->dst_core_idle || + !capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; @@ -10804,6 +11958,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, /* Select the overloaded group with highest avg_load. */ return sgs->avg_load > busiest->avg_load; + case group_llc_balance: + /* Select the group with most tasks preferring dst LLC */ + return update_llc_busiest(env, busiest, sgs); + case group_imbalanced: /* * Select the 1st imbalanced group as we don't have any way to @@ -11066,6 +12224,7 @@ static bool update_pick_idlest(struct sched_group *idlest, return false; break; + case group_llc_balance: case group_imbalanced: case group_asym_packing: case group_smt_balance: @@ -11198,6 +12357,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int return NULL; break; + case group_llc_balance: case group_imbalanced: case group_asym_packing: case group_smt_balance: @@ -11348,6 +12508,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd unsigned long sum_util = 0; bool sg_overloaded = 0, sg_overutilized = 0; + env->dst_core_idle = !sched_smt_active() || is_core_idle(env->dst_cpu); + do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; @@ -11450,6 +12612,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } +#ifdef CONFIG_SCHED_CACHE + if (busiest->group_type == group_llc_balance) { + /* Move a task that prefers the local LLC */ + env->migration_type = migrate_llc_task; + env->imbalance = 1; 
+ return; + } +#endif + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages @@ -11696,7 +12867,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) * group's child domain. */ if (sds.prefer_sibling && local->group_type == group_has_spare && - sibling_imbalance(env, &sds, busiest, local) > 1) + (busiest->group_type == group_llc_balance || + sibling_imbalance(env, &sds, busiest, local) > 1)) goto force_balance; if (busiest->group_type != group_overloaded) { @@ -11755,7 +12927,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, { struct rq *busiest = NULL, *rq; unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int __maybe_unused busiest_pref_llc = 0; + struct sched_domain __maybe_unused *sd_tmp; unsigned int busiest_nr = 0; + int __maybe_unused dst_llc; int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { @@ -11883,6 +13058,23 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, break; + case migrate_llc_task: +#ifdef CONFIG_SCHED_CACHE + sd_tmp = rcu_dereference_all(rq->sd); + dst_llc = llc_id(env->dst_cpu); + + if (sd_tmp && (unsigned)dst_llc < sd_tmp->llc_max) { + unsigned int this_pref_llc = + sd_tmp->llc_counts[dst_llc]; + + if (busiest_pref_llc < this_pref_llc) { + busiest_pref_llc = this_pref_llc; + busiest = rq; + } + } +#endif + break; + } } @@ -11934,6 +13126,9 @@ static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; + if (alb_break_llc(env)) + return 0; + if (asym_active_balance(env)) return 1; @@ -11953,7 +13148,8 @@ static int need_active_balance(struct lb_env *env) return 1; } - if (env->migration_type == migrate_misfit) + if (env->migration_type == migrate_misfit || + env->migration_type == migrate_llc_task) return 1; return 0; @@ -11998,7 +13194,9 @@ static int should_we_balance(struct lb_env *env) * balancing cores, but remember the first idle SMT CPU for 
* later consideration. Find CPU on an idle core first. */ - if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { + if (sched_smt_active() && + !(env->sd->flags & SD_SHARE_CPUCAPACITY) && + !is_core_idle(cpu)) { if (idle_smt == -1) idle_smt = cpu; /* @@ -12006,9 +13204,7 @@ static int should_we_balance(struct lb_env *env) * idle has been found, then its not needed to check other * SMT siblings for idleness: */ -#ifdef CONFIG_SCHED_SMT cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu)); -#endif continue; } @@ -12046,6 +13242,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd case migrate_misfit: __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); break; + case migrate_llc_task: + break; } } @@ -12249,9 +13447,16 @@ more_balance: * * Similarly for migration_misfit which is not related to * load/util migration, don't pollute nr_balance_failed. + * + * The same applies to cache aware scheduling's allowance for + * load imbalance. If regular load balance does not + * migrate a task due to LLC locality, it is expected + * behavior, so don't pollute nr_balance_failed. + * See can_migrate_task(). 
*/ if (idle != CPU_NEWLY_IDLE && - env.migration_type != migrate_misfit) + env.migration_type != migrate_misfit && + !(env.flags & LBF_LLC_PINNED)) sd->nr_balance_failed++; if (need_active_balance(&env)) { @@ -12755,8 +13960,6 @@ static void nohz_balancer_kick(struct rq *rq) goto out; } - rcu_read_lock(); - sd = rcu_dereference_all(rq->sd); if (sd) { /* @@ -12764,8 +13967,8 @@ static void nohz_balancer_kick(struct rq *rq) * capacity, kick the ILB to see if there's a better CPU to run on: */ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) { - flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; - goto unlock; + flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto out; } } @@ -12781,8 +13984,8 @@ static void nohz_balancer_kick(struct rq *rq) */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym(sd, i, cpu)) { - flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; - goto unlock; + flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto out; } } } @@ -12793,10 +13996,8 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU * to run the misfit task on. */ - if (check_misfit_status(rq)) { - flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; - goto unlock; - } + if (check_misfit_status(rq)) + flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; /* * For asymmetric systems, we do not want to nicely balance @@ -12805,10 +14006,10 @@ static void nohz_balancer_kick(struct rq *rq) * * Skip the LLC logic because it's not relevant in that case. */ - goto unlock; + goto out; } - sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu)); + sds = rcu_dereference_all(per_cpu(sd_balance_shared, cpu)); if (sds) { /* * If there is an imbalance between LLC domains (IOW we could @@ -12820,13 +14021,9 @@ static void nohz_balancer_kick(struct rq *rq) * like this LLC domain has tasks we could move. 
*/ nr_busy = atomic_read(&sds->nr_busy_cpus); - if (nr_busy > 1) { - flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; - goto unlock; - } + if (nr_busy > 1) + flags |= NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; } -unlock: - rcu_read_unlock(); out: if (READ_ONCE(nohz.needs_update)) flags |= NOHZ_NEXT_KICK; @@ -12838,17 +14035,17 @@ out: static void set_cpu_sd_state_busy(int cpu) { struct sched_domain *sd; - - rcu_read_lock(); sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); - if (!sd || !sd->nohz_idle) - goto unlock; + /* + * sd->nohz_idle only pairs with nr_busy_cpus on sd->shared; if this + * domain has no shared object there is nothing to clear or account. + */ + if (!sd || !sd->shared || !sd->nohz_idle) + return; sd->nohz_idle = 0; atomic_inc(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); } void nohz_balance_exit_idle(struct rq *rq) @@ -12867,17 +14064,14 @@ void nohz_balance_exit_idle(struct rq *rq) static void set_cpu_sd_state_idle(int cpu) { struct sched_domain *sd; - - rcu_read_lock(); sd = rcu_dereference_all(per_cpu(sd_llc, cpu)); - if (!sd || sd->nohz_idle) - goto unlock; + /* See set_cpu_sd_state_busy(): nohz_idle is only used with sd->shared. 
*/ + if (!sd || !sd->shared || sd->nohz_idle) + return; sd->nohz_idle = 1; atomic_dec(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); } /* @@ -13636,7 +14830,7 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) struct cfs_rq *cfs_rq; #ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq = task_group(p)->cfs_rq[cpu]; + cfs_rq = tg_cfs_rq(task_group(p), cpu); #else cfs_rq = &cpu_rq(cpu)->cfs; #endif @@ -13656,8 +14850,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { - struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; + struct cfs_rq *cfs_rq; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); @@ -13670,6 +14864,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + task_tick_cache(rq, curr); + update_misfit_status(curr, rq); check_update_overutilized_status(task_rq(curr)); @@ -13828,9 +15024,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) } } -static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +/* + * Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. 
+ */ +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; + bool throttled = false; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) && + first && cfs_rq->curr) + break; + + set_next_entity(cfs_rq, se, first); + /* ensure bandwidth has been allocated on our new cfs_rq */ + throttled |= account_cfs_rq_runtime(cfs_rq, 0); + } + + if (throttled) + task_throttle_setup_work(p); + + se = &p->se; if (task_on_rq_queued(p)) { /* @@ -13851,27 +15071,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs sched_fair_update_stop_tick(rq, p); } -/* - * Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) -{ - struct sched_entity *se = &p->se; - - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - set_next_entity(cfs_rq, se, first); - /* ensure bandwidth has been allocated on our new cfs_rq */ - account_cfs_rq_runtime(cfs_rq, 0); - } - - __set_next_task_fair(rq, p, first); -} - void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; @@ -13899,56 +15098,38 @@ static void task_change_group_fair(struct task_struct *p) void free_fair_sched_group(struct task_group *tg) { - int i; - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - - kfree(tg->cfs_rq); - kfree(tg->se); + free_percpu(tg->cfs_rq); } int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { + struct cfs_tg_state __percpu *state; struct sched_entity *se; struct cfs_rq *cfs_rq; int i; - tg->cfs_rq = kzalloc_objs(cfs_rq, nr_cpu_ids); - if (!tg->cfs_rq) - goto err; - tg->se = kzalloc_objs(se, nr_cpu_ids); - if (!tg->se) + state = 
alloc_percpu_gfp(struct cfs_tg_state, GFP_KERNEL); + if (!state) goto err; + tg->cfs_rq = &state->cfs_rq; tg->shares = NICE_0_LOAD; init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); for_each_possible_cpu(i) { - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL, cpu_to_node(i)); + cfs_rq = tg_cfs_rq(tg, i); if (!cfs_rq) goto err; - se = kzalloc_node(sizeof(struct sched_entity_stats), - GFP_KERNEL, cpu_to_node(i)); - if (!se) - goto err_free_rq; - + se = tg_se(tg, i); init_cfs_rq(cfs_rq); - init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + init_tg_cfs_entry(tg, cfs_rq, se, i, tg_se(parent, i)); init_entity_runnable_average(se); } return 1; -err_free_rq: - kfree(cfs_rq); err: return 0; } @@ -13962,7 +15143,7 @@ void online_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { rq = cpu_rq(i); - se = tg->se[i]; + se = tg_se(tg, i); rq_lock_irq(rq, &rf); update_rq_clock(rq); attach_entity_cfs_rq(se); @@ -13978,8 +15159,8 @@ void unregister_fair_sched_group(struct task_group *tg) destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(cpu) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; - struct sched_entity *se = tg->se[cpu]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu); + struct sched_entity *se = tg_se(tg, cpu); struct rq *rq = cpu_rq(cpu); if (se) { @@ -14015,9 +15196,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->rq = rq; init_cfs_rq_runtime(cfs_rq); - tg->cfs_rq[cpu] = cfs_rq; - tg->se[cpu] = se; - /* se could be NULL for root_task_group */ if (!se) return; @@ -14047,7 +15225,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) /* * We can't change the weight of the root cgroup. 
*/ - if (!tg->se[0]) + if (is_root_task_group(tg)) return -EINVAL; shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); @@ -14058,7 +15236,7 @@ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se = tg->se[i]; + struct sched_entity *se = tg_se(tg, i); struct rq_flags rf; /* Propagate contribution to hierarchy */ @@ -14109,8 +15287,8 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se = tg->se[i]; - struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + struct sched_entity *se = tg_se(tg, i); + struct cfs_rq *grp_cfs_rq = tg_cfs_rq(tg, i); bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; @@ -14183,7 +15361,6 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = wakeup_preempt_fair, .pick_task = pick_task_fair, - .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 84c4fe3abd74..8f0dee8fc475 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -110,8 +110,16 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) * rq lock and possibly create a large contention, sending an * IPI to that CPU and let that CPU push the RT task to where * it should go may be a better scenario. + * + * This is best for PREEMPT_RT, but for non-RT it can cause issues + * when preemption is disabled for long periods of time. Have + * it only default enabled for PREEMPT_RT. 
*/ +# ifdef CONFIG_PREEMPT_RT SCHED_FEAT(RT_PUSH_IPI, true) +# else +SCHED_FEAT(RT_PUSH_IPI, false) +# endif #endif SCHED_FEAT(RT_RUNTIME_SHARE, false) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index a83be0c834dd..052435f4d3e3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -280,6 +280,14 @@ static void do_idle(void) int cpu = smp_processor_id(); bool got_tick = false; + if (cpu_is_offline(cpu)) { + local_irq_disable(); + /* All per-CPU kernel threads should be done by now. */ + WARN_ON_ONCE(need_resched()); + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); + } + /* * Check if we need to update blocked load */ @@ -331,11 +339,6 @@ static void do_idle(void) */ local_irq_disable(); - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } - arch_cpu_idle_enter(); rcu_nocb_flush_deferred_wakeup(); @@ -462,7 +465,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int flags) } static int -balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +balance_idle(struct rq *rq, struct rq_flags *rf) { return WARN_ON_ONCE(1); } diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 623445603725..cb957b8f1946 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -164,8 +164,26 @@ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ | MEMBARRIER_CMD_GET_REGISTRATIONS) +/* + * Scoped guard for memory barriers on entry and exit. + * Matches memory barriers before & after rq->curr modification in scheduler. 
+ */ +DEFINE_LOCK_GUARD_0(mb, smp_mb(), smp_mb()) static DEFINE_MUTEX(membarrier_ipi_mutex); +static DEFINE_PER_CPU(struct mutex, membarrier_cpu_mutexes); + #define SERIALIZE_IPI() guard(mutex)(&membarrier_ipi_mutex) +#define SERIALIZE_IPI_CPU(cpu_id) guard(mutex)(&per_cpu(membarrier_cpu_mutexes, cpu_id)) + +static int __init membarrier_init(void) +{ + int i; + + for_each_possible_cpu(i) + mutex_init(&per_cpu(membarrier_cpu_mutexes, i)); + return 0; +} +core_initcall(membarrier_init); static void ipi_mb(void *info) { @@ -199,7 +217,16 @@ static void ipi_rseq(void *info) * is negligible. */ smp_mb(); - rseq_sched_switch_event(current); + /* + * Legacy mode requires that IDs are written and the critical section is + * evaluated. V2 optimized mode handles the critical section and IDs are + * only updated if they change as a consequence of preemption after + * return from this IPI. + */ + if (rseq_v2(current)) + rseq_sched_switch_event(current); + else + rseq_force_update(); } static void ipi_sync_rq_state(void *info) @@ -249,23 +276,19 @@ void membarrier_update_current_mm(struct mm_struct *next_mm) static int membarrier_global_expedited(void) { + cpumask_var_t __free(free_cpumask_var) tmpmask = CPUMASK_VAR_NULL; int cpu; - cpumask_var_t tmpmask; if (num_online_cpus() == 1) return 0; - /* - * Matches memory barriers after rq->curr modification in - * scheduler. - */ - smp_mb(); /* system call entry is not a mb. */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; + guard(mb)(); SERIALIZE_IPI(); - cpus_read_lock(); + guard(cpus_read_lock)(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -301,21 +324,11 @@ static int membarrier_global_expedited(void) smp_call_function_many(tmpmask, ipi_mb, NULL, 1); preempt_enable(); - free_cpumask_var(tmpmask); - cpus_read_unlock(); - - /* - * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. 
Matches memory barriers before - * rq->curr modification in scheduler. - */ - smp_mb(); /* exit from system call is not a mb */ return 0; } static int membarrier_private_expedited(int flags, int cpu_id) { - cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; smp_call_func_t ipi_func = ipi_mb; @@ -352,30 +365,45 @@ static int membarrier_private_expedited(int flags, int cpu_id) * On RISC-V, this barrier pairing is also needed for the * SYNC_CORE command when switching between processes, cf. * the inline comments in membarrier_arch_switch_mm(). + * + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers before + * rq->curr modification in scheduler. */ - smp_mb(); /* system call entry is not a mb. */ - - if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - - SERIALIZE_IPI(); - cpus_read_lock(); - + guard(mb)(); if (cpu_id >= 0) { + if (cpu_id >= nr_cpu_ids || !cpu_possible(cpu_id)) + return 0; + + SERIALIZE_IPI_CPU(cpu_id); + guard(cpus_read_lock)(); struct task_struct *p; - if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) - goto out; + if (!cpu_online(cpu_id)) + return 0; + rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu_id)->curr); if (!p || p->mm != mm) { rcu_read_unlock(); - goto out; + return 0; } rcu_read_unlock(); + /* + * smp_call_function_single() will call ipi_func() if cpu_id + * is the calling CPU. 
+ */ + smp_call_function_single(cpu_id, ipi_func, NULL, 1); } else { + cpumask_var_t __free(free_cpumask_var) tmpmask = CPUMASK_VAR_NULL; int cpu; + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + SERIALIZE_IPI(); + guard(cpus_read_lock)(); + rcu_read_lock(); for_each_online_cpu(cpu) { struct task_struct *p; @@ -385,15 +413,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) __cpumask_set_cpu(cpu, tmpmask); } rcu_read_unlock(); - } - - if (cpu_id >= 0) { - /* - * smp_call_function_single() will call ipi_func() if cpu_id - * is the calling CPU. - */ - smp_call_function_single(cpu_id, ipi_func, NULL, 1); - } else { /* * For regular membarrier, we can save a few cycles by * skipping the current cpu -- we're about to do smp_mb() @@ -420,18 +439,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) } } -out: - if (cpu_id < 0) - free_cpumask_var(tmpmask); - cpus_read_unlock(); - - /* - * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. Matches memory barriers before - * rq->curr modification in scheduler. - */ - smp_mb(); /* exit from system call is not a mb */ - return 0; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4ee8faf01441..e474c31d8fe6 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -19,9 +19,9 @@ int sysctl_sched_rt_period = 1000000; /* * part of the period that we allow rt tasks to run in us. 
- * default: 0.95s + * default: 1s */ -int sysctl_sched_rt_runtime = 950000; +int sysctl_sched_rt_runtime = 1000000; #ifdef CONFIG_SYSCTL static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ; @@ -1596,8 +1596,14 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) resched_curr(rq); } -static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static int balance_rt(struct rq *rq, struct rq_flags *rf) { + /* + * Note, rq->donor may change during rq lock drops, + * so don't re-use p across lock drops + */ + struct task_struct *p = rq->donor; + if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { /* * This is OK, because current is on_cpu, which avoids it being diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f63b15d309d..c7c2dea65edd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -421,6 +421,10 @@ extern void ext_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init); +extern int dl_server_attach_bw(struct sched_dl_entity *dl_se); +extern void dl_server_detach_bw(struct sched_dl_entity *dl_se); +extern int dl_server_swap_bw(struct sched_dl_entity *detach_se, + struct sched_dl_entity *attach_se); static inline bool dl_server_active(struct sched_dl_entity *dl_se) { @@ -480,10 +484,8 @@ struct task_group { #endif #ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each CPU */ - struct sched_entity **se; /* runqueue "owned" by this group on each CPU */ - struct cfs_rq **cfs_rq; + struct cfs_rq __percpu *cfs_rq; unsigned long shares; /* * load_avg can be heavily contended at clock tick time, so put @@ -889,6 +891,7 @@ struct dl_rq { bool overloaded; + struct sched_dl_entity *curr; /* * Tasks on this rq that can be pushed away. 
They are kept in * an rb-tree, ordered by tasks' deadlines, with caching @@ -929,7 +932,8 @@ struct dl_rq { }; #ifdef CONFIG_FAIR_GROUP_SCHED - +/* Check whether a task group is root tg */ +#define is_root_task_group(tg) ((tg) == &root_task_group) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) @@ -1187,6 +1191,12 @@ struct rq { struct scx_rq scx; struct sched_dl_entity ext_server; #endif +#ifdef CONFIG_SCHED_CACHE + raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; + u64 cpu_runtime; + unsigned long cpu_epoch; + unsigned long cpu_epoch_next; +#endif struct sched_dl_entity fair_server; @@ -1199,6 +1209,12 @@ struct rq { #ifdef CONFIG_NUMA_BALANCING unsigned int numa_migrate_on; #endif + +#ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; +#endif + /* * This is part of a global counter where only the total sum * over all CPUs matters. A task can increase this counter on @@ -1546,6 +1562,14 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); +static inline bool task_has_sched_core(struct task_struct *p) +{ + if (sched_core_disabled()) + return false; + + return !!p->core_cookie; +} + #else /* !CONFIG_SCHED_CORE: */ static inline bool sched_core_enabled(struct rq *rq) @@ -1586,6 +1610,11 @@ static inline bool sched_group_cookie_match(struct rq *rq, return true; } +static inline bool task_has_sched_core(struct task_struct *p) +{ + return false; +} + #endif /* !CONFIG_SCHED_CORE */ #ifdef CONFIG_RT_GROUP_SCHED @@ -1667,21 +1696,15 @@ do { \ flags = _raw_spin_rq_lock_irqsave(rq); \ } while (0) -#ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); static inline void update_idle_core(struct rq *rq) { - if (static_branch_unlikely(&sched_smt_present)) + if (sched_smt_active()) __update_idle_core(rq); } -#else /* !CONFIG_SCHED_SMT: */ -static inline void 
update_idle_core(struct rq *rq) { } -#endif /* !CONFIG_SCHED_SMT */ - #ifdef CONFIG_FAIR_GROUP_SCHED - static inline struct task_struct *task_of(struct sched_entity *se) { WARN_ON_ONCE(!entity_is_task(se)); @@ -2082,6 +2105,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p) #endif /* !CONFIG_NUMA_BALANCING */ +int task_llc(const struct task_struct *p); + static inline void queue_balance_callback(struct rq *rq, struct balance_callback *head, @@ -2171,6 +2196,7 @@ DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); @@ -2267,6 +2293,46 @@ static inline struct task_group *task_group(struct task_struct *p) return p->sched_task_group; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Defined here to be available before stats.h is included, since + * stats.h has dependencies on things defined later in this file. 
+ */ +struct cfs_tg_state { + struct cfs_rq cfs_rq; + struct sched_entity se; + struct sched_statistics stats; +} __no_randomize_layout; + +/* Access a specific CPU's cfs_rq from a task group */ +static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu) +{ + return per_cpu_ptr(tg->cfs_rq, cpu); +} + +static inline struct sched_entity *tg_se(struct task_group *tg, int cpu) +{ + struct cfs_tg_state *state; + + if (is_root_task_group(tg)) + return NULL; + + state = container_of(tg_cfs_rq(tg, cpu), struct cfs_tg_state, cfs_rq); + return &state->se; +} + +static inline struct sched_entity *cfs_rq_se(struct cfs_rq *cfs_rq) +{ + struct cfs_tg_state *state; + + if (is_root_task_group(cfs_rq->tg)) + return NULL; + + state = container_of(cfs_rq, struct cfs_tg_state, cfs_rq); + return &state->se; +} +#endif + /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { @@ -2275,10 +2341,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED - set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); - p->se.cfs_rq = tg->cfs_rq[cpu]; - p->se.parent = tg->se[cpu]; - p->se.depth = tg->se[cpu] ? tg->se[cpu]->depth + 1 : 0; + set_task_rq_fair(&p->se, p->se.cfs_rq, tg_cfs_rq(tg, cpu)); + p->se.cfs_rq = tg_cfs_rq(tg, cpu); + p->se.parent = tg_se(tg, cpu); + p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -2561,23 +2627,12 @@ struct sched_class { /* * schedule/pick_next_task/prev_balance: rq->lock */ - int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*balance)(struct rq *rq, struct rq_flags *rf); /* * schedule/pick_next_task: rq->lock */ struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); - /* - * Optional! 
When implemented pick_next_task() should be equivalent to: - * - * next = pick_task(); - * if (next) { - * put_prev_task(prev); - * set_next_task_first(next); - * } - */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf); /* * sched_change: @@ -2801,8 +2856,7 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf); +extern struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf); extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01 @@ -4037,6 +4091,29 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_SCHED_MM_CID */ +#ifdef CONFIG_SCHED_CACHE +DECLARE_STATIC_KEY_FALSE(sched_cache_present); +DECLARE_STATIC_KEY_FALSE(sched_cache_active); +extern int sysctl_sched_cache_user; +extern unsigned int llc_aggr_tolerance; +extern unsigned int llc_epoch_period; +extern unsigned int llc_epoch_affinity_timeout; +extern unsigned int llc_imb_pct; +extern unsigned int llc_overaggr_pct; + +static inline bool sched_cache_enabled(void) +{ + return static_branch_unlikely(&sched_cache_active); +} + +extern void sched_cache_active_set(void); + +#endif + +void sched_domains_free_llc_id(int cpu); + +extern void init_sched_mm(struct task_struct *p); + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); static inline diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index a612cf253c87..ebe0a7765f98 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -89,19 +89,12 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt #endif /* CONFIG_SCHEDSTATS */ -#ifdef 
CONFIG_FAIR_GROUP_SCHED -struct sched_entity_stats { - struct sched_entity se; - struct sched_statistics stats; -} __no_randomize_layout; -#endif - static inline struct sched_statistics * __schedstats_from_se(struct sched_entity *se) { #ifdef CONFIG_FAIR_GROUP_SCHED if (!entity_is_task(se)) - return &container_of(se, struct sched_entity_stats, se)->stats; + return &container_of(se, struct cfs_tg_state, se)->stats; #endif return &task_of(se)->stats; } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index f95798baddeb..c909ca0d8c87 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -16,7 +16,7 @@ select_task_rq_stop(struct task_struct *p, int cpu, int flags) } static int -balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +balance_stop(struct rq *rq, struct rq_flags *rf) { return sched_stop_runnable(rq); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5847b83d9d55..622e2e01974c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -19,8 +19,10 @@ void sched_domains_mutex_unlock(void) } /* Protected by sched_domains_mutex: */ +static cpumask_var_t sched_domains_llc_id_allocmask; static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; +int max_lid; static int __init sched_debug_setup(char *str) { @@ -621,6 +623,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } +static void free_sched_domain_shared(struct sched_domain_shared *sds) +{ + if (sds && atomic_dec_and_test(&sds->ref)) + kfree(sds); +} + static void destroy_sched_domain(struct sched_domain *sd) { /* @@ -629,9 +637,12 @@ static void destroy_sched_domain(struct sched_domain *sd) * dropping group/capacity references, freeing where none remain. 
*/ free_sched_groups(sd->groups, 1); + free_sched_domain_shared(sd->shared); - if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) - kfree(sd->shared); +#ifdef CONFIG_SCHED_CACHE + /* only the bottom sd has llc_counts array */ + kfree(sd->llc_counts); +#endif kfree(sd); } @@ -663,9 +674,10 @@ static void destroy_sched_domains(struct sched_domain *sd) */ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); -DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_llc_id) = -1; DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); @@ -692,7 +704,6 @@ static void update_top_cache_domain(int cpu) rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; - per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_CLUSTER); @@ -713,7 +724,18 @@ static void update_top_cache_domain(int cpu) rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); + /* + * The shared object is attached to sd_asym_cpucapacity only when the + * asym domain is non-overlapping (i.e., not built from SD_NUMA). + * On overlapping (NUMA) asym domains we fall back to letting the + * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL + * here. + */ + if (sd && sd->shared) + sds = sd->shared; + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); + rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds); } /* @@ -737,7 +759,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Pick reference to parent->shared. 
*/ if (parent->shared) { - WARN_ON_ONCE(tmp->shared); + /* + * It is safe to free a sd->shared that + * has not been published yet. If a + * sd->shared was published, the refcount + * will end up being non-zero and it will + * not be freed here. + */ + free_sched_domain_shared(tmp->shared); tmp->shared = parent->shared; parent->shared = NULL; } @@ -762,10 +791,20 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd && sd_degenerate(sd)) { tmp = sd; sd = sd->parent; - destroy_sched_domain(tmp); + if (sd) { struct sched_group *sg = sd->groups; +#ifdef CONFIG_SCHED_CACHE + /* move buffer to parent as child is being destroyed */ + sd->llc_counts = tmp->llc_counts; + sd->llc_max = tmp->llc_max; + sd->llc_bytes = tmp->llc_bytes; + /* make sure destroy_sched_domain() does not free it */ + tmp->llc_counts = NULL; + tmp->llc_max = 0; + tmp->llc_bytes = 0; +#endif /* * sched groups hold the flags of the child sched * domain for convenience. Clear such flags since @@ -777,6 +816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + + destroy_sched_domain(tmp); } sched_domain_debug(sd, cpu); @@ -804,6 +845,239 @@ enum s_alloc { sa_none, }; +#ifdef CONFIG_SCHED_CACHE +/* hardware support for cache aware scheduling */ +DEFINE_STATIC_KEY_FALSE(sched_cache_present); +/* + * Indicator of whether cache aware scheduling + * is active, used by the scheduler. + */ +DEFINE_STATIC_KEY_FALSE(sched_cache_active); +/* user wants cache aware scheduling [0 or 1] */ +int sysctl_sched_cache_user = 1; + +/* + * Get the effective LLC size in bytes that @cpu's bottom sched_domain + * can use. A CPU within a cpuset partition can only use a proportion + * of the physical LLC, scaled by the ratio of the partition's span + * weight to the hardware LLC sharing weight. @sd should be the + * topmost domain with SD_SHARE_LLC. + * + * Returns 0 if cacheinfo is not yet populated. 
This happens during + * early boot when build_sched_domains() runs before the generic + * cacheinfo framework has been initialized (cacheinfo_cpu_online() + * is a device_initcall cpuhp callback). In that case, + * cacheinfo_cpu_online() will later call sched_update_llc_bytes() + * to fill in the bottom domain's llc_bytes once the cache attributes + * are available. + */ +static unsigned long get_effective_llc_bytes(int cpu, + struct sched_domain *sd) +{ + struct cacheinfo *ci; + unsigned int hw_weight; + + ci = get_cpu_cacheinfo_llc(cpu); + if (!ci) + return 0; + + hw_weight = cpumask_weight(&ci->shared_cpu_map); + if (!hw_weight) + return 0; + + return div_u64((u64)ci->size * sd->span_weight, hw_weight); +} + +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + struct sched_domain *sd, *top_llc, *parent; + unsigned int *p; + int i; + + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (!sd) + goto err; + + p = kcalloc_node(max_lid + 1, sizeof(unsigned int), + GFP_KERNEL, cpu_to_node(i)); + if (!p) + goto err; + + top_llc = sd; + /* + * Find the topmost SD_SHARE_LLC domain. + * Not yet attached to the CPU, so per_cpu(sd_llc, i) + * can not be used. + */ + while ((parent = rcu_dereference_protected(top_llc->parent, true)) && + (parent->flags & SD_SHARE_LLC)) + top_llc = parent; + + if (top_llc->flags & SD_SHARE_LLC) { + sd->llc_max = max_lid + 1; + sd->llc_counts = p; + sd->llc_bytes = get_effective_llc_bytes(i, top_llc); + } else { + /* avoid memory leak */ + kfree(p); + } + } + + return true; +err: + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d->sd, i); + if (sd) { + kfree(sd->llc_counts); + sd->llc_counts = NULL; + sd->llc_max = 0; + sd->llc_bytes = 0; + } + } + + return false; +} + +/* + * Enable/disable cache aware scheduling according to + * user input and the presence of hardware support. 
+ */ +static void _sched_cache_active_set(void) +{ + lockdep_assert_cpus_held(); + lockdep_assert_held(&sched_domains_mutex); + + /* hardware does not support */ + if (!static_branch_likely(&sched_cache_present)) { + static_branch_disable_cpuslocked(&sched_cache_active); + if (sched_debug()) + pr_info("%s: cache aware scheduling not supported on this platform\n", __func__); + return; + } + + /* + * user wants it or not ? + * TBD: read before writing the static key. + * It is not in the critical path, leave as-is + * for now. + */ + if (sysctl_sched_cache_user) { + static_branch_enable_cpuslocked(&sched_cache_active); + if (sched_debug()) + pr_info("%s: enabling cache aware scheduling\n", __func__); + } else { + static_branch_disable_cpuslocked(&sched_cache_active); + if (sched_debug()) + pr_info("%s: disabling cache aware scheduling\n", __func__); + } +} + +/* used by debugfs */ +void sched_cache_active_set(void) +{ + cpus_read_lock(); + sched_domains_mutex_lock(); + _sched_cache_active_set(); + sched_domains_mutex_unlock(); + cpus_read_unlock(); +} + +/* + * Update the bottom sched_domain's llc_bytes for @cpu and all its + * LLC siblings. Called from cacheinfo_cpu_online() or + * cacheinfo_cpu_pre_down() with cpu hotplug lock held. + * + * Note: get_effective_llc_bytes() returns 0 on PowerPC, + * thus cache aware scheduling is disabled on PowerPC for + * now. PowerPC does not use the generic cacheinfo framework -- + * it has its own cacheinfo with a separate struct cache hierarchy + * and does not populate the per-CPU struct cpu_cacheinfo array + * that get_cpu_cacheinfo_llc() reads.
+ */ +void sched_update_llc_bytes(unsigned int cpu) +{ + struct sched_domain *sd, *sdp; + unsigned int i; + + sched_domains_mutex_lock(); + + sdp = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu)); + if (!sdp) + goto unlock; + + /* + * ci->shared_cpu_map is built incrementally as CPUs come + * online, so the first CPU in an LLC initially sees + * hw_weight == 1 and computes an inflated llc_bytes in + * get_effective_llc_bytes(). Re-evaluating every LLC + * sibling on each online event corrects this once the full + * shared_cpu_map is known. + */ + for_each_cpu(i, sched_domain_span(sdp)) { + sd = rcu_dereference_sched_domain(cpu_rq(i)->sd); + if (sd) + sd->llc_bytes = get_effective_llc_bytes(i, sdp); + } + +unlock: + sched_domains_mutex_unlock(); +} + +static void sched_cache_set(bool has_multi_llcs) +{ + /* + * TBD: check before writing to it. sched domain rebuild + * is not in the critical path, leave as-is for now. + */ + if (has_multi_llcs) + static_branch_enable_cpuslocked(&sched_cache_present); + else + static_branch_disable_cpuslocked(&sched_cache_present); + + _sched_cache_active_set(); +} +#else +static bool alloc_sd_llc(const struct cpumask *cpu_map, + struct s_data *d) +{ + return false; +} +static inline void sched_cache_set(bool has_multi_llcs) { } +#endif + +/* + * Return true if @sd belongs to an LLC group whose enclosing + * partition spans more than one LLC. @sd must be the topmost + * SD_SHARE_LLC domain. + * + * Any duplicated parent domains with the same span as @sd are + * skipped: before cpu_attach_domain() degeneration these still + * exist, after degeneration the loop is a no-op. This makes the + * helper usable both during sched domain build and against an + * already-attached domain tree. + * + * Note: For systems with a single LLC per node, cache-aware + * scheduling is still enabled when multiple nodes exist. + * However, NUMA balancing decisions take precedence over + * cache-aware scheduling. 
Conversely, if there is only one + * LLC per partition, cache-aware scheduling should be disabled. + */ +static bool sd_in_multi_llcs(struct sched_domain *sd) +{ + struct sched_domain *sdp = sd->parent; + + /* it does not make sense to aggregate to 1 CPU */ + if (sd->span_weight == 1) + return false; + + while (sdp && sdp->span_weight == sd->span_weight) + sdp = sdp->parent; + + return !!sdp; +} + /* * Return the canonical balance CPU for this group, this is the first CPU * of this group that's also in the balance mask. @@ -1310,9 +1584,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) cpumask_copy(mask, sched_group_span(sg)); for_each_cpu(cpu, mask) { cores++; -#ifdef CONFIG_SCHED_SMT cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); -#endif } sg->cores = cores; @@ -1790,8 +2062,22 @@ const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu { return cpu_coregroup_mask(cpu); } + +/* + * Majority of architectures have LLC at MC domain level with exception + * such as powerpc. Provide a way for arch to specify where its LLC is + * if it falls in exception category + */ +# ifndef arch_llc_mask +#define arch_llc_mask(cpu) cpu_coregroup_mask(cpu) +# endif + +#else +#define arch_llc_mask(cpu) cpumask_of(cpu) #endif +#define llc_mask(cpu) arch_llc_mask(cpu) + const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) { return cpu_node_mask(cpu); @@ -2650,14 +2936,153 @@ static void adjust_numa_imbalance(struct sched_domain *sd_llc) } } +static void +init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags) +{ + struct sched_domain_shared *sds = NULL; + int cpu; + + /* + * Multiple domains can try to claim a shared object like + * SD_ASYM_CPUCAPACITY and SD_SHARE_LLC which can alias to + * same cpumask_first(sched_domain_span(sd)) CPU and can + * cause "nr_idle_scan" to be populated incorrectly during + * load balancing. 
+ * + * Find the first CPU in sched_domain_span(sd) with an + * unclaimed domain (!alloc_flags) or where the alloc_flag + * matches the requested flag (SD_* flag) + * + * If the domain only has single CPU, allow temporary overlap + * in allocation since the domains will be degenerated later. + */ + for_each_cpu(cpu, sched_domain_span(sd)) { + sds = *per_cpu_ptr(d->sds, cpu); + + if (!sds->alloc_flags || + sd->span_weight == 1 || + sds->alloc_flags == flags) { + sds->alloc_flags = flags; + sd->shared = sds; + break; + } + } + + /* + * Use the sd_shared corresponding to the last + * CPU in the span if none are available. + */ + if (WARN_ON_ONCE(!sd->shared)) + sd->shared = sds; + + /* + * nr_busy_cpus is consumed only by the NOHZ kick path via + * sd_balance_shared; on the asym-capacity path it is initialized but + * never read. + */ + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); + atomic_inc(&sd->shared->ref); +} + +/* + * For asymmetric CPU capacity, attach sched_domain_shared on the innermost + * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is + * not an overlapping NUMA-built domain (then LLC should claim shared). + * + * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island), + * then LLC must claim shared instead. + * + * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values + * are present in the domain span, so the asym domain we attach to cannot + * degenerate into a single-capacity group. The relevant edge cases are instead + * covered by the caveats above. + * + * Return true if this CPU's asym path claimed sd->shared, false otherwise.
+ */ +static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu) +{ + struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu); + struct sched_domain *sd_asym; + + if (!sd) + return false; + + sd_asym = sd; + while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL)) + sd_asym = sd_asym->parent; + + if (!sd_asym || (sd_asym->flags & SD_NUMA)) + return false; + + init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY); + return true; +} + +static int __sched_domains_alloc_llc_id(void) +{ + int lid, max; + + lockdep_assert_held(&sched_domains_mutex); + + lid = cpumask_first_zero(sched_domains_llc_id_allocmask); + /* + * llc_id space should never grow larger than the + * possible number of CPUs in the system. + */ + if (lid >= nr_cpu_ids) + return -1; + + __cpumask_set_cpu(lid, sched_domains_llc_id_allocmask); + max = cpumask_last(sched_domains_llc_id_allocmask); + if (max > max_lid) + max_lid = max; + + return lid; +} + +static void __sched_domains_free_llc_id(int cpu) +{ + int i, lid, max; + + lockdep_assert_held(&sched_domains_mutex); + + lid = per_cpu(sd_llc_id, cpu); + if (lid == -1 || lid >= nr_cpu_ids) + return; + + per_cpu(sd_llc_id, cpu) = -1; + + for_each_cpu(i, llc_mask(cpu)) { + /* An online CPU owns the llc_id. 
*/ + if (per_cpu(sd_llc_id, i) == lid) + return; + } + + __cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask); + + max = cpumask_last(sched_domains_llc_id_allocmask); + /* shrink max lid to save memory */ + if (max < max_lid) + max_lid = max; +} + +void sched_domains_free_llc_id(int cpu) +{ + sched_domains_mutex_lock(); + __sched_domains_free_llc_id(cpu); + sched_domains_mutex_unlock(); +} + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs */ static int -build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) +build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr, + bool *multi_llcs) { enum s_alloc alloc_state = sa_none; + bool has_multi_llcs = false; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; @@ -2675,6 +3100,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; + int lid; sd = NULL; for_each_sd_topology(tl) { @@ -2688,6 +3114,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } + + lid = per_cpu(sd_llc_id, i); + if (lid == -1) { + /* try to reuse the llc_id of its siblings */ + for (int j = cpumask_first(llc_mask(i)); + j < nr_cpu_ids; + j = cpumask_next(j, llc_mask(i))) { + if (i == j) + continue; + + lid = per_cpu(sd_llc_id, j); + + if (lid != -1) { + per_cpu(sd_llc_id, i) = lid; + + break; + } + } + + /* a new LLC is detected */ + if (lid == -1) + per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id(); + } } if (WARN_ON(!topology_span_sane(cpu_map))) @@ -2712,23 +3161,27 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (!sd) continue; + if (has_asym) + claim_asym_sched_domain_shared(&d, i); + /* First, find the topmost SD_SHARE_LLC domain */ while 
(sd->parent && (sd->parent->flags & SD_SHARE_LLC)) sd = sd->parent; if (sd->flags & SD_SHARE_LLC) { - int sd_id = cpumask_first(sched_domain_span(sd)); - - sd->shared = *per_cpu_ptr(d.sds, sd_id); - atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); - atomic_inc(&sd->shared->ref); + init_sched_domain_shared(&d, sd, SD_SHARE_LLC); /* * In presence of higher domains, adjust the * NUMA imbalance stats for the hierarchy. */ - if (IS_ENABLED(CONFIG_NUMA) && sd->parent) - adjust_numa_imbalance(sd); + if (sd->parent) { + if (IS_ENABLED(CONFIG_NUMA)) + adjust_numa_imbalance(sd); + + if (sd_in_multi_llcs(sd)) + has_multi_llcs = true; + } } } @@ -2743,6 +3196,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att init_sched_groups_capacity(i, sd); } + alloc_sd_llc(cpu_map, &d); + /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { @@ -2767,6 +3222,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att ret = 0; error: + *multi_llcs = has_multi_llcs; __free_domain_allocs(&d, alloc_state, cpu_map); return ret; @@ -2829,8 +3285,10 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) */ int __init sched_init_domains(const struct cpumask *cpu_map) { + bool multi_llcs; int err; + zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL); zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); @@ -2842,7 +3300,9 @@ int __init sched_init_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); - err = build_sched_domains(doms_cur[0], NULL); + err = build_sched_domains(doms_cur[0], NULL, &multi_llcs); + if (!err) + sched_cache_set(multi_llcs); return err; } @@ -2915,6 +3375,7 @@ static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new struct sched_domain_attr 
*dattr_new) { bool __maybe_unused has_eas = false; + bool has_multi_llcs = false, multi_llcs; int i, j, n; int new_topology; @@ -2964,14 +3425,41 @@ match1: for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && - dattrs_equal(dattr_new, i, dattr_cur, j)) + dattrs_equal(dattr_new, i, dattr_cur, j)) { + /* + * Reused partition has to be taken care + * of here, because there could be a corner + * case that if the reused partition is skipped + * and only new partition is considered, an + * incorrect has_multi_llcs would be set. For + * example: + * If the only multi-LLC partition is reused + * and a new single-LLC partition is built, + * sched_cache_set(false) disables cache-aware + * scheduling globally despite the reused + * multi-LLC partition still being active. + */ + struct sched_domain *sd; + int cpu = cpumask_first(doms_cur[j]); + + guard(rcu)(); + sd = rcu_dereference(cpu_rq(cpu)->sd); + while (sd && sd->parent && (sd->parent->flags & SD_SHARE_LLC)) + sd = sd->parent; + if (sd && (sd->flags & SD_SHARE_LLC) && sd->parent && + sd_in_multi_llcs(sd)) + has_multi_llcs = true; goto match2; + } } /* No match - add a new doms_new */ - build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); + build_sched_domains(doms_new[i], dattr_new ? 
dattr_new + i : NULL, + &multi_llcs); + has_multi_llcs |= multi_llcs; match2: ; } + sched_cache_set(has_multi_llcs); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) /* Build perf domains: */ diff --git a/kernel/signal.c b/kernel/signal.c index 2d102e025883..9c2b32c4d755 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1338,6 +1338,7 @@ int zap_other_threads(struct task_struct *p) int count = 0; p->signal->group_stop_count = 0; + task_clear_jobctl_pending(p, JOBCTL_PENDING_MASK); for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 3fe6b0c99f3d..773d8e9ae30c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -633,6 +633,11 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) EXPORT_SYMBOL_GPL(stop_machine); #ifdef CONFIG_SCHED_SMT +/* + * INTEL_IFS is the only user of this API. That selftest can + * only be compiled if SMP=y. On x86 it selects SCHED_SMT. + * Keep the ifdefs for now. 
+ */ int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); diff --git a/kernel/sys.c b/kernel/sys.c index 62e842055cc9..df69bd71de03 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2565,14 +2565,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = put_user(me->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: - error = get_dumpable(me->mm); + error = task_exec_state_get_dumpable(me); break; case PR_SET_DUMPABLE: - if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) { + if (arg2 != TASK_DUMPABLE_OFF && arg2 != TASK_DUMPABLE_OWNER) { error = -EINVAL; break; } - set_dumpable(me->mm, arg2); + task_exec_state_set_dumpable(arg2); break; case PR_SET_UNALIGN: diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 02aac7c5aa76..d098ac39bde4 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -16,10 +16,6 @@ config ARCH_CLOCKSOURCE_INIT config ARCH_WANTS_CLOCKSOURCE_READ_INLINE bool -# Timekeeping vsyscall support -config GENERIC_TIME_VSYSCALL - bool - # The generic clock events infrastructure config GENERIC_CLOCKEVENTS def_bool !LEGACY_TIMER_TICK diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 6e173d70d825..ea5be5870e51 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -337,48 +337,32 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, EXPORT_SYMBOL_GPL(alarm_init); /** - * alarm_start - Sets an absolute alarm to fire - * @alarm: ptr to alarm to set - * @start: time to run the alarm + * alarm_start_timer - Sets an alarm to fire + * @alarm: Pointer to alarm to set + * @expires: Expiry time + * @relative: True if @expires is relative + * + * Returns: True if the alarm was queued. 
False if it already expired */ -void alarm_start(struct alarm *alarm, ktime_t start) +bool alarm_start_timer(struct alarm *alarm, ktime_t expires, bool relative) { struct alarm_base *base = &alarm_bases[alarm->type]; - scoped_guard(spinlock_irqsave, &base->lock) { - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); - } + if (relative) + expires = ktime_add_safe(expires, base->get_ktime()); trace_alarmtimer_start(alarm, base->get_ktime()); -} -EXPORT_SYMBOL_GPL(alarm_start); - -/** - * alarm_start_relative - Sets a relative alarm to fire - * @alarm: ptr to alarm to set - * @start: time relative to now to run the alarm - */ -void alarm_start_relative(struct alarm *alarm, ktime_t start) -{ - struct alarm_base *base = &alarm_bases[alarm->type]; - - start = ktime_add_safe(start, base->get_ktime()); - alarm_start(alarm, start); -} -EXPORT_SYMBOL_GPL(alarm_start_relative); - -void alarm_restart(struct alarm *alarm) -{ - struct alarm_base *base = &alarm_bases[alarm->type]; guard(spinlock_irqsave)(&base->lock); - hrtimer_set_expires(&alarm->timer, alarm->node.expires); - hrtimer_restart(&alarm->timer); + alarm->node.expires = expires; alarmtimer_enqueue(base, alarm); + if (!hrtimer_start_range_ns_user(&alarm->timer, expires, 0, HRTIMER_MODE_ABS)) { + alarmtimer_dequeue(base, alarm); + return false; + } + return true; } -EXPORT_SYMBOL_GPL(alarm_restart); +EXPORT_SYMBOL_GPL(alarm_start_timer); /** * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -512,8 +496,6 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * @now: time at the timer expiration * * Posix timer callback for expired alarm timers. 
- * - * Return: whether the timer is to be restarted */ static void alarm_handle_timer(struct alarm *alarm, ktime_t now) { @@ -527,12 +509,12 @@ static void alarm_handle_timer(struct alarm *alarm, ktime_t now) * alarm_timer_rearm - Posix timer callback for rearming timer * @timr: Pointer to the posixtimer data struct */ -static void alarm_timer_rearm(struct k_itimer *timr) +static bool alarm_timer_rearm(struct k_itimer *timr) { struct alarm *alarm = &timr->it.alarm.alarmtimer; timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); - alarm_start(alarm, alarm->node.expires); + return alarm_start_timer(alarm, alarm->node.expires, false); } /** @@ -588,7 +570,7 @@ static void alarm_timer_wait_running(struct k_itimer *timr) * @absolute: Expiry value is absolute time * @sigev_none: Posix timer does not deliver signals */ -static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, +static bool alarm_timer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct alarm *alarm = &timr->it.alarm.alarmtimer; @@ -596,10 +578,16 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, if (!absolute) expires = ktime_add_safe(expires, base->get_ktime()); - if (sigev_none) + + /* + * sigev_none needs to update the expires value and pretend + * that the timer is queued + */ + if (sigev_none) { alarm->node.expires = expires; - else - alarm_start(&timr->it.alarm.alarmtimer, expires); + return true; + } + return alarm_start_timer(&timr->it.alarm.alarmtimer, expires, false); } /** @@ -706,7 +694,9 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); - alarm_start(alarm, absexp); + if (!alarm_start_timer(alarm, absexp, false)) + alarm->data = NULL; + if (likely(alarm->data)) schedule(); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5e22697b098d..0014d163f989 100644 --- a/kernel/time/clockevents.c +++ 
b/kernel/time/clockevents.c @@ -301,7 +301,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) #include <asm/clock_inlined.h> #else static __always_inline void -arch_inlined_clockevent_set_next_coupled(u64 u64 cycles, struct clock_event_device *dev) { } +arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { } #endif static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index baee13a1f87f..e48c4d379a7c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -1222,14 +1222,8 @@ static void clocksource_enqueue(struct clocksource *cs) * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale - * - * This should only be called from the clocksource->enable() method. - * - * This *SHOULD NOT* be called directly! Please use the - * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper - * functions. 
*/ -void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) +static void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; @@ -1287,7 +1281,6 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } -EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); /** * __clocksource_register_scale - Used to install new clocksources @@ -1338,6 +1331,26 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } EXPORT_SYMBOL_GPL(__clocksource_register_scale); +static void __devm_clocksource_unregister(void *data) +{ + struct clocksource *cs = data; + + clocksource_unregister(cs); +} + +int __devm_clocksource_register_scale(struct device *dev, struct clocksource *cs, + u32 scale, u32 freq) +{ + int ret; + + ret = __clocksource_register_scale(cs, scale, freq); + if (ret) + return ret; + + return devm_add_action_or_reset(dev, __devm_clocksource_unregister, cs); +} +EXPORT_SYMBOL_GPL(__devm_clocksource_register_scale); + /* * Unbind clocksource @cs. 
Called with clocksource_mutex held */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5bd6efe598f0..638ce623c342 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1352,8 +1352,14 @@ static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool return hrtimer_prefer_local(is_local, is_first, is_pinned); } -static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, - const enum hrtimer_mode mode, struct hrtimer_clock_base *base) +enum { + HRTIMER_REPROGRAM_NONE, + HRTIMER_REPROGRAM, + HRTIMER_REPROGRAM_FORCE, +}; + +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, + const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); bool is_pinned, first, was_first, keep_base = false; @@ -1410,7 +1416,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del /* If a deferred rearm is pending skip reprogramming the device */ if (cpu_base->deferred_rearm) { cpu_base->deferred_needs_update = true; - return false; + return HRTIMER_REPROGRAM_NONE; } if (!was_first || cpu_base != this_cpu_base) { @@ -1423,7 +1429,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del * callbacks. */ if (likely(hrtimer_base_is_online(this_cpu_base))) - return first; + return first ? 
HRTIMER_REPROGRAM : HRTIMER_REPROGRAM_NONE; /* * Timer was enqueued remote because the current base is @@ -1432,7 +1438,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del */ if (first) smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd); - return false; + return HRTIMER_REPROGRAM_NONE; } /* @@ -1446,7 +1452,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del */ if (timer->is_lazy) { if (cpu_base->expires_next <= hrtimer_get_expires(timer)) - return false; + return HRTIMER_REPROGRAM_NONE; } /* @@ -1455,8 +1461,24 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del * reprogram the hardware by evaluating the new first expiring * timer. */ - hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); - return false; + return HRTIMER_REPROGRAM_FORCE; +} + +static int hrtimer_start_range_ns_common(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) +{ + /* + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft + * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard + * expiry mode because unmarked timers are moved to softirq expiry. 
+ */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + else + WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); + + return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base); } /** @@ -1476,24 +1498,104 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, debug_hrtimer_assert_init(timer); + base = lock_hrtimer_base(timer, &flags); + + switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) { + case HRTIMER_REPROGRAM: + hrtimer_reprogram(timer, true); + break; + case HRTIMER_REPROGRAM_FORCE: + hrtimer_force_reprogram(timer->base->cpu_base, 1); + break; + case HRTIMER_REPROGRAM_NONE: + break; + } + + unlock_hrtimer_base(timer, &flags); +} +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +static inline bool hrtimer_check_user_timer(struct hrtimer *timer) +{ + struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base; + ktime_t expires; + /* - * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft - * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard - * expiry mode because unmarked timers are moved to softirq expiry. + * This uses soft expires because that's the user provided + * expiry time, while expires can be further in the past + * due to a slack value added to the user expiry time. */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); - else - WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); + expires = hrtimer_get_softexpires(timer); + + /* Convert to monotonic */ + expires = ktime_sub(expires, timer->base->offset); + + /* + * Check whether this timer will end up as the first expiring timer in + * the CPU base. If not, no further checks required as it's then + * guaranteed to expire in the future. + */ + if (expires >= cpu_base->expires_next) + return true; + + /* Validate that the expiry time is in the future. 
*/ + if (expires > ktime_get()) + return true; + + debug_hrtimer_deactivate(timer); + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_INACTIVE, false); + trace_hrtimer_start_expired(timer); + return false; +} + +/** + * hrtimer_start_range_ns_user - (re)start an user controlled hrtimer + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); + * softirq based mode is considered for debug purpose only! + * + * Returns: True when the timer was queued, false if it was already expired + * + * This function cannot invoke the timer callback for expired timers as it might + * be called under a lock which the timer callback needs to acquire. So the + * caller has to handle that case. + */ +bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + bool ret = true; + + debug_hrtimer_assert_init(timer); base = lock_hrtimer_base(timer, &flags); - if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) - hrtimer_reprogram(timer, true); + switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) { + case HRTIMER_REPROGRAM: + ret = hrtimer_check_user_timer(timer); + if (ret) + hrtimer_reprogram(timer, true); + break; + case HRTIMER_REPROGRAM_FORCE: + ret = hrtimer_check_user_timer(timer); + /* + * The base must always be reevaluated, independent of the + * result above because the timer was the first pending timer. 
+ */ + hrtimer_force_reprogram(timer->base->cpu_base, 1); + break; + case HRTIMER_REPROGRAM_NONE: + break; + } unlock_hrtimer_base(timer, &flags); + return ret; } -EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns_user); /** * hrtimer_try_to_cancel - try to deactivate a timer @@ -1681,10 +1783,10 @@ EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ -u64 hrtimer_get_next_event(void) +ktime_t hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - u64 expires = KTIME_MAX; + ktime_t expires = KTIME_MAX; guard(raw_spinlock_irqsave)(&cpu_base->lock); if (!hrtimer_hres_active(cpu_base)) @@ -1700,10 +1802,10 @@ u64 hrtimer_get_next_event(void) * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ -u64 hrtimer_next_event_without(const struct hrtimer *exclude) +ktime_t hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - u64 expires = KTIME_MAX; + ktime_t expires = KTIME_MAX; unsigned int active; guard(raw_spinlock_irqsave)(&cpu_base->lock); @@ -2213,7 +2315,11 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; - hrtimer_start_expires(&sl->timer, mode); + /* If already expired, clear the task pointer and set current state to running */ + if (!hrtimer_start_expires_user(&sl->timer, mode)) { + sl->task = NULL; + __set_current_state(TASK_RUNNING); + } } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 1c954f330dfe..d51428867a33 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -60,15 +60,14 @@ EXPORT_SYMBOL(get_jiffies_64); EXPORT_SYMBOL(jiffies); -static int __init init_jiffies_clocksource(void) -{ - 
return __clocksource_register(&clocksource_jiffies); -} - -core_initcall(init_jiffies_clocksource); +static bool cs_jiffies_registered __initdata; struct clocksource * __init __weak clocksource_default_clock(void) { + if (!cs_jiffies_registered) { + __clocksource_register(&clocksource_jiffies); + cs_jiffies_registered = true; + } return &clocksource_jiffies; } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 4bca3f78c8ea..5fa0af66cf3f 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -57,6 +57,7 @@ ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, return tim; } +EXPORT_SYMBOL_GPL(do_timens_ktime_to_host); static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { @@ -351,6 +352,7 @@ struct time_namespace init_time_ns = { .user_ns = &init_user_ns, .frozen_offsets = true, }; +EXPORT_SYMBOL_GPL(init_time_ns); void __init time_ns_init(void) { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0de2bb7cbec0..74775b94d11b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -19,7 +19,7 @@ #include "posix-timers.h" -static void posix_cpu_timer_rearm(struct k_itimer *timer); +static bool posix_cpu_timer_rearm(struct k_itimer *timer); void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { @@ -1011,24 +1011,27 @@ static void check_process_timers(struct task_struct *tsk, /* * This is called from the signal code (via posixtimer_rearm) * when the last timer signal was delivered and we have to reload the timer. + * + * Return true unconditionally so the core code assumes the timer to be + * armed. Otherwise it would requeue the signal. 
*/ -static void posix_cpu_timer_rearm(struct k_itimer *timer) +static bool posix_cpu_timer_rearm(struct k_itimer *timer) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct task_struct *p; struct sighand_struct *sighand; + struct task_struct *p; unsigned long flags; u64 now; - rcu_read_lock(); + guard(rcu)(); p = cpu_timer_task_rcu(timer); if (!p) - goto out; + return true; /* Protect timer list r/w in arm_timer() */ sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) - goto out; + return true; /* * Fetch the current sample and update the timer's expiry time. @@ -1045,8 +1048,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) */ arm_timer(timer, p); unlock_task_sighand(p, &flags); -out: - rcu_read_unlock(); + return true; } /** @@ -1504,6 +1506,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, spin_lock_irq(&timer.it_lock); error = posix_cpu_timer_set(&timer, flags, &it, NULL); if (error) { + posix_cpu_timer_del(&timer); spin_unlock_irq(&timer.it_lock); return error; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 9331e1614124..436ba794cc0b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -288,16 +288,18 @@ static inline int timer_overrun_to_int(struct k_itimer *timr) return (int)timr->it_overrun_last; } -static void common_hrtimer_rearm(struct k_itimer *timr) +static bool common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); - hrtimer_restart(timer); + return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS); } static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr) { + bool queued; + guard(spinlock)(&timr->it_lock); /* @@ -311,12 +313,18 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != 
POSIX_TIMER_REQUEUE_PENDING)) return true; - timr->kclock->timer_rearm(timr); - timr->it_status = POSIX_TIMER_ARMED; + /* timer_rearm() updates timr::it_overrun */ + queued = timr->kclock->timer_rearm(timr); + timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1LL; ++timr->it_signal_seq; info->si_overrun = timer_overrun_to_int(timr); + + if (queued) + timr->it_status = POSIX_TIMER_ARMED; + else + posix_timer_queue_signal(timr); return true; } @@ -795,7 +803,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return timer_overrun_to_int(scoped_timer); } -static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, +static bool common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct hrtimer *timer = &timr->it.real.timer; @@ -820,8 +828,11 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); - if (!sigev_none) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + /* For sigev_none pretend that the timer is queued */ + if (sigev_none) + return true; + + return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS); } static int common_hrtimer_try_to_cancel(struct k_itimer *timr) @@ -903,9 +914,13 @@ int common_timer_set(struct k_itimer *timr, int flags, expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; - kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); - if (!sigev_none) - timr->it_status = POSIX_TIMER_ARMED; + if (kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none)) { + if (!sigev_none) + timr->it_status = POSIX_TIMER_ARMED; + } else { + /* Timer was already expired, queue the signal */ + posix_timer_queue_signal(timr); + } return 0; } diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 7f259e845d24..4ea9611dd716 100644 --- a/kernel/time/posix-timers.h +++ 
b/kernel/time/posix-timers.h @@ -27,11 +27,11 @@ struct k_clock { int (*timer_del)(struct k_itimer *timr); void (*timer_get)(struct k_itimer *timr, struct itimerspec64 *cur_setting); - void (*timer_rearm)(struct k_itimer *timr); + bool (*timer_rearm)(struct k_itimer *timr); s64 (*timer_forward)(struct k_itimer *timr, ktime_t now); ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); int (*timer_try_to_cancel)(struct k_itimer *timr); - void (*timer_arm)(struct k_itimer *timr, ktime_t expires, + bool (*timer_arm)(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none); void (*timer_wait_running)(struct k_itimer *timr); }; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cbbb87a0c6e7..98a9cae915c0 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -285,8 +285,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { touch_softlockup_watchdog_sched(); - if (is_idle_task(current)) - ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming @@ -751,119 +749,6 @@ static void tick_nohz_update_jiffies(ktime_t now) touch_softlockup_watchdog_sched(); } -static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) -{ - ktime_t delta; - - if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) - return; - - delta = ktime_sub(now, ts->idle_entrytime); - - write_seqcount_begin(&ts->idle_sleeptime_seq); - if (nr_iowait_cpu(smp_processor_id()) > 0) - ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); - else - ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - - ts->idle_entrytime = now; - tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); - write_seqcount_end(&ts->idle_sleeptime_seq); - - sched_clock_idle_wakeup_event(); -} - -static void tick_nohz_start_idle(struct tick_sched 
*ts) -{ - write_seqcount_begin(&ts->idle_sleeptime_seq); - ts->idle_entrytime = ktime_get(); - tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); - write_seqcount_end(&ts->idle_sleeptime_seq); - - sched_clock_idle_sleep_event(); -} - -static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, - bool compute_delta, u64 *last_update_time) -{ - ktime_t now, idle; - unsigned int seq; - - if (!tick_nohz_active) - return -1; - - now = ktime_get(); - if (last_update_time) - *last_update_time = ktime_to_us(now); - - do { - seq = read_seqcount_begin(&ts->idle_sleeptime_seq); - - if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { - ktime_t delta = ktime_sub(now, ts->idle_entrytime); - - idle = ktime_add(*sleeptime, delta); - } else { - idle = *sleeptime; - } - } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); - - return ktime_to_us(idle); - -} - -/** - * get_cpu_idle_time_us - get the total idle time of a CPU - * @cpu: CPU number to query - * @last_update_time: variable to store update time in. Do not update - * counters if NULL. - * - * Return the cumulative idle time (since boot) for a given - * CPU, in microseconds. Note that this is partially broken due to - * the counter of iowait tasks that can be remotely updated without - * any synchronization. Therefore it is possible to observe backward - * values within two consecutive reads. - * - * This time is measured via accounting rather than sampling, - * and is as accurate as ktime_get() is. 
- * - * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu - */ -u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - - return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, - !nr_iowait_cpu(cpu), last_update_time); -} -EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); - -/** - * get_cpu_iowait_time_us - get the total iowait time of a CPU - * @cpu: CPU number to query - * @last_update_time: variable to store update time in. Do not update - * counters if NULL. - * - * Return the cumulative iowait time (since boot) for a given - * CPU, in microseconds. Note this is partially broken due to - * the counter of iowait tasks that can be remotely updated without - * any synchronization. Therefore it is possible to observe backward - * values within two consecutive reads. - * - * This time is measured via accounting rather than sampling, - * and is as accurate as ktime_get() is. - * - * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu - */ -u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) -{ - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - - return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, - nr_iowait_cpu(cpu), last_update_time); -} -EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); - /* Simplified variant of hrtimer_forward_now() */ static ktime_t tick_forward_now(ktime_t expires, ktime_t now) { @@ -1273,7 +1158,7 @@ void tick_nohz_idle_stop_tick(void) ts->idle_expires = expires; if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { - ts->idle_jiffies = ts->last_jiffies; + kcpustat_dyntick_start(ts->idle_entrytime); nohz_balance_enter_idle(cpu); } } else { @@ -1286,6 +1171,20 @@ void tick_nohz_idle_retain_tick(void) tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); } +static void tick_nohz_clock_sleep(struct tick_sched *ts) +{ + tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); + sched_clock_idle_sleep_event(); +} + +static void 
tick_nohz_clock_wakeup(struct tick_sched *ts) +{ + if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) { + tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); + sched_clock_idle_wakeup_event(); + } +} + /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * @@ -1300,11 +1199,10 @@ void tick_nohz_idle_enter(void) local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); - WARN_ON_ONCE(ts->timer_expires_base); - tick_sched_flag_set(ts, TS_FLAG_INIDLE); - tick_nohz_start_idle(ts); + ts->idle_entrytime = ktime_get(); + tick_nohz_clock_sleep(ts); local_irq_enable(); } @@ -1332,10 +1230,14 @@ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) - tick_nohz_start_idle(ts); - else + if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) { + tick_nohz_clock_sleep(ts); + ts->idle_entrytime = ktime_get(); + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) + kcpustat_irq_exit(ts->idle_entrytime); + } else { tick_nohz_full_update_tick(ts); + } } /** @@ -1407,8 +1309,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ - next_event = min_t(u64, next_event, - hrtimer_next_event_without(&ts->sched_timer)); + next_event = min(next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } @@ -1429,36 +1330,20 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) return ts->idle_calls; } -static void tick_nohz_account_idle_time(struct tick_sched *ts, - ktime_t now) -{ - unsigned long ticks; - - ts->idle_exittime = now; - - if (vtime_accounting_enabled_this_cpu()) - return; - /* - * We stopped the tick in idle. update_process_times() would miss the - * time we slept, as it does only a 1 tick accounting. - * Enforce that this is accounted to idle ! - */ - ticks = jiffies - ts->idle_jiffies; - /* - * We might be one off. 
Do not randomly account a huge number of ticks! - */ - if (ticks && ticks < LONG_MAX) - account_idle_ticks(ticks); -} - void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { - ktime_t now = ktime_get(); - tick_nohz_restart_sched_tick(ts, now); - tick_nohz_account_idle_time(ts, now); + /* + * Update entrytime here in case the tick restart is due to temporary + * polling on forced broadcast. The tick may be stopped again later within + * the same idle trip. The idle_entrytime was updated recently but make sure + * no tiny amount of idle time is accounted twice. + */ + ts->idle_entrytime = ktime_get(); + kcpustat_dyntick_stop(ts->idle_entrytime); + tick_nohz_restart_sched_tick(ts, ts->idle_entrytime); } } @@ -1468,8 +1353,6 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); - - tick_nohz_account_idle_time(ts, now); } /** @@ -1491,7 +1374,6 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); - bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); @@ -1500,17 +1382,13 @@ void tick_nohz_idle_exit(void) WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_clear(ts, TS_FLAG_INIDLE); - idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE); - tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); + tick_nohz_clock_wakeup(ts); - if (idle_active || tick_stopped) + if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { now = ktime_get(); - - if (idle_active) - tick_nohz_stop_idle(ts, now); - - if (tick_stopped) + kcpustat_dyntick_stop(now); tick_nohz_idle_update_tick(ts, now); + } local_irq_enable(); } @@ -1565,11 +1443,14 @@ static inline void tick_nohz_irq_enter(void) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; - if 
(!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) + tick_nohz_clock_wakeup(ts); + + if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) return; + now = ktime_get(); - if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) - tick_nohz_stop_idle(ts, now); + kcpustat_irq_enter(now); + /* * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay @@ -1577,8 +1458,7 @@ static inline void tick_nohz_irq_enter(void) * rare case (typically stop machine). So we must make sure we have a * last resort. */ - if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) - tick_nohz_update_jiffies(now); + tick_nohz_update_jiffies(now); } #else @@ -1648,20 +1528,15 @@ void tick_setup_sched_timer(bool hrtimer) void tick_sched_timer_dying(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! */ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); - idle_sleeptime = ts->idle_sleeptime; - iowait_sleeptime = ts->iowait_sleeptime; idle_calls = ts->idle_calls; idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); - ts->idle_sleeptime = idle_sleeptime; - ts->iowait_sleeptime = iowait_sleeptime; ts->idle_calls = idle_calls; ts->idle_sleeps = idle_sleeps; } diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index b4a7822f495d..79b9252047b1 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -44,9 +44,7 @@ struct tick_device { * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. * @next_tick: Next tick to be fired when in dynticks mode. 
- * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_waketime: Time when the idle was interrupted - * @idle_sleeptime_seq: sequence counter for data consistency * @idle_entrytime: Time when the idle call was entered * @last_jiffies: Base jiffies snapshot when next event was last computed * @timer_expires_base: Base time clock monotonic for @timer_expires @@ -55,9 +53,6 @@ struct tick_device { * @idle_expires: Next tick in idle, for debugging purpose only * @idle_calls: Total number of idle calls * @idle_sleeps: Number of idle calls, where the sched tick was stopped - * @idle_exittime: Time when the idle state was left - * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped - * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick * @check_clocks: Notification mechanism about clocksource changes */ @@ -73,12 +68,10 @@ struct tick_sched { struct hrtimer sched_timer; ktime_t last_tick; ktime_t next_tick; - unsigned long idle_jiffies; ktime_t idle_waketime; unsigned int got_idle_tick; /* Idle entry */ - seqcount_t idle_sleeptime_seq; ktime_t idle_entrytime; /* Tick stop */ @@ -90,11 +83,6 @@ struct tick_sched { unsigned long idle_calls; unsigned long idle_sleeps; - /* Idle exit */ - ktime_t idle_exittime; - ktime_t idle_sleeptime; - ktime_t iowait_sleeptime; - /* Full dynticks handling */ atomic_t tick_dep_mask; diff --git a/kernel/time/time.c b/kernel/time/time.c index 0d832317d576..771cef87ad3b 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -207,7 +207,7 @@ SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) + if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; diff --git a/kernel/time/timekeeping.c 
b/kernel/time/timekeeping.c index c493a4010305..0d5b67f609bb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -67,6 +67,7 @@ static inline bool tk_is_aux(const struct timekeeper *tk) { return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; } +static inline struct tk_data *aux_get_tk_data(clockid_t id); #else static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) { @@ -77,6 +78,10 @@ static inline bool tk_is_aux(const struct timekeeper *tk) { return false; } +static inline struct tk_data *aux_get_tk_data(clockid_t id) +{ + return NULL; +} #endif static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs) @@ -315,6 +320,7 @@ static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) return clock->read(clock); } + static inline void clocksource_disable_inline_read(void) { } static inline void clocksource_enable_inline_read(void) { } #endif @@ -1182,44 +1188,107 @@ noinstr time64_t __ktime_get_real_seconds(void) return tk->xtime_sec; } -/** - * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter - * @systime_snapshot: pointer to struct receiving the system time snapshot - */ -void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) +static inline u64 tk_clock_read_snapshot(const struct tk_read_base *tkr, + struct clocksource_hw_snapshot *chs) { - struct timekeeper *tk = &tk_core.timekeeper; + struct clocksource *clock = READ_ONCE(tkr->clock); + + if (unlikely(clock->read_snapshot)) + return clock->read_snapshot(clock, chs); + + return clock->read(clock); +} + + +/** + * ktime_get_snapshot_id - Simultaneously snapshot a given clock ID with + * CLOCK_MONOTONIC_RAW and the underlying + * clocksource counter value. 
+ * @clock_id: The clock ID to snapshot + * @systime_snapshot: Pointer to struct receiving the system time snapshot + */ +void ktime_get_snapshot_id(clockid_t clock_id, struct system_time_snapshot *systime_snapshot) +{ + ktime_t base_raw, base_sys, offs_sys, *offs, offs_zero = 0; + u64 nsec_raw, nsec_sys, now; + struct timekeeper *tk; + struct tk_data *tkd; unsigned int seq; - ktime_t base_raw; - ktime_t base_real; - ktime_t base_boot; - u64 nsec_raw; - u64 nsec_real; - u64 now; - WARN_ON_ONCE(timekeeping_suspended); + /* Invalidate the snapshot for all failure cases */ + systime_snapshot->valid = false; + + if (WARN_ON_ONCE(timekeeping_suspended)) + return; + + switch (clock_id) { + case CLOCK_REALTIME: + tkd = &tk_core; + offs = &tk_core.timekeeper.offs_real; + break; + /* Map RAW to MONOTONIC so the loop below is trivial */ + case CLOCK_MONOTONIC_RAW: + case CLOCK_MONOTONIC: + tkd = &tk_core; + offs = &offs_zero; + break; + case CLOCK_BOOTTIME: + tkd = &tk_core; + offs = &tk_core.timekeeper.offs_boot; + break; + case CLOCK_AUX ... 
CLOCK_AUX_LAST: + tkd = aux_get_tk_data(clock_id); + if (!tkd) + return; + offs = &tkd->timekeeper.offs_aux; + break; + default: + WARN_ON_ONCE(1); + return; + } + + tk = &tkd->timekeeper; do { - seq = read_seqcount_begin(&tk_core.seq); - now = tk_clock_read(&tk->tkr_mono); + struct clocksource_hw_snapshot chs = { }; + + seq = read_seqcount_begin(&tkd->seq); + + /* Aux clocks can be invalid */ + if (!tk->clock_valid) + return; + + now = tk_clock_read_snapshot(&tk->tkr_mono, &chs); systime_snapshot->cs_id = tk->tkr_mono.clock->id; + + systime_snapshot->hw_cycles = chs.hw_cycles; + systime_snapshot->hw_csid = chs.hw_csid; + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; - base_real = ktime_add(tk->tkr_mono.base, - tk_core.timekeeper.offs_real); - base_boot = ktime_add(tk->tkr_mono.base, - tk_core.timekeeper.offs_boot); + + base_sys = tk->tkr_mono.base; + offs_sys = *offs; base_raw = tk->tkr_raw.base; - nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); - nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); - } while (read_seqcount_retry(&tk_core.seq, seq)); + + nsec_sys = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tkd->seq, seq)); systime_snapshot->cycles = now; - systime_snapshot->real = ktime_add_ns(base_real, nsec_real); - systime_snapshot->boot = ktime_add_ns(base_boot, nsec_real); - systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); + systime_snapshot->systime = ktime_add_ns(base_sys, offs_sys + nsec_sys); + systime_snapshot->monoraw = ktime_add_ns(base_raw, nsec_raw); + + /* + * Special case for PTP. Just transfer the raw time into sys, + * so the call sites can consistently use snap::systime. 
+ */ + if (clock_id == CLOCK_MONOTONIC_RAW) + systime_snapshot->systime = systime_snapshot->monoraw; + /* Tell the consumer that this snapshot is valid */ + systime_snapshot->valid = true; } -EXPORT_SYMBOL_GPL(ktime_get_snapshot); +EXPORT_SYMBOL_GPL(ktime_get_snapshot_id); /* Scale base by mult/div checking for overflow */ static int scale64_check_overflow(u64 mult, u64 div, u64 *base) @@ -1262,7 +1331,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, struct system_device_crosststamp *ts) { struct timekeeper *tk = &tk_core.timekeeper; - u64 corr_raw, corr_real; + u64 corr_raw, corr_sys; bool interp_forward; int ret; @@ -1279,8 +1348,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, * Scale the monotonic raw time delta by: * partial_history_cycles / total_history_cycles */ - corr_raw = (u64)ktime_to_ns( - ktime_sub(ts->sys_monoraw, history->raw)); + corr_raw = (u64)ktime_to_ns(ktime_sub(ts->sys_monoraw, history->monoraw)); ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, &corr_raw); if (ret) @@ -1288,30 +1356,29 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, /* * If there is a discontinuity in the history, scale monotonic raw - * correction by: - * mult(real)/mult(raw) yielding the realtime correction - * Otherwise, calculate the realtime correction similar to monotonic - * raw calculation + * correction by: + * mult(sys)/mult(raw) yielding the system time correction + * + * Otherwise, calculate the system time correction similar to monotonic + * raw calculation */ if (discontinuity) { - corr_real = mul_u64_u32_div - (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + corr_sys = mul_u64_u32_div(corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); } else { - corr_real = (u64)ktime_to_ns( - ktime_sub(ts->sys_realtime, history->real)); - ret = scale64_check_overflow(partial_history_cycles, - total_history_cycles, &corr_real); + corr_sys = 
(u64)ktime_to_ns(ktime_sub(ts->sys_systime, history->systime)); + ret = scale64_check_overflow(partial_history_cycles, total_history_cycles, + &corr_sys); if (ret) return ret; } - /* Fixup monotonic raw and real time time values */ + /* Fixup monotonic raw and system time time values */ if (interp_forward) { - ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); - ts->sys_realtime = ktime_add_ns(history->real, corr_real); + ts->sys_monoraw = ktime_add_ns(history->monoraw, corr_raw); + ts->sys_systime = ktime_add_ns(history->systime, corr_sys); } else { ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); - ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + ts->sys_systime = ktime_sub_ns(ts->sys_systime, corr_sys); } return 0; @@ -1368,6 +1435,8 @@ static bool convert_base_to_cs(struct system_counterval_t *scv) return false; scv->cycles += base->offset; + /* Set the clocksource ID as scv::cycles is now clocksource based */ + scv->cs_id = cs->id; return true; } @@ -1435,11 +1504,11 @@ EXPORT_SYMBOL_GPL(ktime_real_to_base_clock); /** * get_device_system_crosststamp - Synchronously capture system/device timestamp - * @get_time_fn: Callback to get simultaneous device time and - * system counter from the device driver + * @get_time_fn: Callback to get simultaneous device time and system counter + * from the device driver * @ctx: Context passed to get_time_fn() - * @history_begin: Historical reference point used to interpolate system - * time when counter provided by the driver is before the current interval + * @history_begin: Historical reference point used to interpolate system time when + * the counter value provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time @@ -1452,36 +1521,54 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_time_snapshot *history_begin, struct system_device_crosststamp 
*xtstamp) { - struct system_counterval_t system_counterval = {}; - struct timekeeper *tk = &tk_core.timekeeper; - u64 cycles, now, interval_start; - unsigned int clock_was_set_seq = 0; - ktime_t base_real, base_raw; - u64 nsec_real, nsec_raw; + u64 syscnt_cycles, cycles, now, interval_start; + unsigned int seq, clock_was_set_seq = 0; + ktime_t base_sys, base_raw, *offs; + u64 nsec_sys, nsec_raw; u8 cs_was_changed_seq; - unsigned int seq; bool do_interp; + struct timekeeper *tk; + struct tk_data *tkd; int ret; + switch (xtstamp->clock_id) { + case CLOCK_REALTIME: + tkd = &tk_core; + offs = &tk_core.timekeeper.offs_real; + break; + case CLOCK_AUX ... CLOCK_AUX_LAST: + tkd = aux_get_tk_data(xtstamp->clock_id); + if (!tkd) + return -ENODEV; + offs = &tkd->timekeeper.offs_aux; + break; + default: + WARN_ON_ONCE(1); + return -ENODEV; + } + + tk = &tkd->timekeeper; + do { - seq = read_seqcount_begin(&tk_core.seq); + seq = read_seqcount_begin(&tkd->seq); /* * Try to synchronously capture device time and a system * counter value calling back into the device driver */ - ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + ret = get_time_fn(&xtstamp->device, &xtstamp->sys_counter, ctx); if (ret) return ret; /* * Verify that the clocksource ID associated with the captured * system counter value is the same as for the currently - * installed timekeeper clocksource + * installed timekeeper clocksource and convert to it. 
*/ - if (system_counterval.cs_id == CSID_GENERIC || - !convert_base_to_cs(&system_counterval)) + if (xtstamp->sys_counter.cs_id == CSID_GENERIC || + !convert_base_to_cs(&xtstamp->sys_counter)) return -ENODEV; - cycles = system_counterval.cycles; + + cycles = syscnt_cycles = xtstamp->sys_counter.cycles; /* * Check whether the system counter value provided by the @@ -1498,15 +1585,14 @@ int get_device_system_crosststamp(int (*get_time_fn) do_interp = false; } - base_real = ktime_add(tk->tkr_mono.base, - tk_core.timekeeper.offs_real); + base_sys = ktime_add(tk->tkr_mono.base, *offs); base_raw = tk->tkr_raw.base; - nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); + nsec_sys = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles); nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles); - } while (read_seqcount_retry(&tk_core.seq, seq)); + } while (read_seqcount_retry(&tkd->seq, seq)); - xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_systime = ktime_add_ns(base_sys, nsec_sys); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); /* @@ -1523,24 +1609,19 @@ int get_device_system_crosststamp(int (*get_time_fn) * clocksource change */ if (!history_begin || - !timestamp_in_interval(history_begin->cycles, - cycles, system_counterval.cycles) || + !timestamp_in_interval(history_begin->cycles, cycles, syscnt_cycles) || history_begin->cs_was_changed_seq != cs_was_changed_seq) return -EINVAL; - partial_history_cycles = cycles - system_counterval.cycles; + + partial_history_cycles = cycles - syscnt_cycles; total_history_cycles = cycles - history_begin->cycles; - discontinuity = - history_begin->clock_was_set_seq != clock_was_set_seq; + discontinuity = history_begin->clock_was_set_seq != clock_was_set_seq; - ret = adjust_historical_crosststamp(history_begin, - partial_history_cycles, - total_history_cycles, - discontinuity, xtstamp); - if (ret) - return ret; + ret = adjust_historical_crosststamp(history_begin, partial_history_cycles, + 
total_history_cycles, discontinuity, xtstamp); } - return 0; + return ret; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 04d928c21aba..655a8c6cd84d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1932,7 +1932,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) */ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { - u64 nextevt = hrtimer_get_next_event(); + u64 nextevt = ktime_to_ns(hrtimer_get_next_event()); /* * If high resolution timers are enabled diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 427d7ddea3af..514802def1e0 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -152,14 +152,10 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_flag(highres, TS_FLAG_HIGHRES); P_ns(last_tick); P_flag(tick_stopped, TS_FLAG_STOPPED); - P(idle_jiffies); P(idle_calls); P(idle_sleeps); P_ns(idle_entrytime); P_ns(idle_waketime); - P_ns(idle_exittime); - P_ns(idle_sleeptime); - P_ns(iowait_sleeptime); P(last_jiffies); P(next_timer); P_ns(idle_expires); @@ -256,7 +252,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) static inline void timer_list_header(struct seq_file *m, u64 now) { - SEQ_printf(m, "Timer List Version: v0.10\n"); + SEQ_printf(m, "Timer List Version: v0.11\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 155eeaea4113..806c23cf71fc 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -102,7 +102,7 @@ * active CPU/group information atomic_try_cmpxchg() is used instead and only * the per CPU tmigr_cpu->lock is held. * - * During the setup of groups tmigr_level_list is required. It is protected by + * During the setup of groups, hier->level_list is required. 
It is protected by * @tmigr_mutex. * * When @timer_base->lock as well as tmigr related locks are required, the lock @@ -416,13 +416,12 @@ */ static DEFINE_MUTEX(tmigr_mutex); -static struct list_head *tmigr_level_list __read_mostly; + +static LIST_HEAD(tmigr_hierarchy_list); static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; -static struct tmigr_group *tmigr_root; - static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); /* @@ -978,8 +977,12 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, /* Drop the lock to allow the remote CPU to exit idle */ raw_spin_unlock_irq(&tmc->lock); - if (cpu != smp_processor_id()) - timer_expire_remote(cpu); + /* + * This can't exclude the local CPU because jiffies might have advanced + * after the timer softirq invoked run_timer_base(BASE_GLOBAL) and the + * point where the jiffies snapshot @jif was taken in tmigr_handle_remote(). + */ + timer_expire_remote(cpu); /* * Lock ordering needs to be preserved - timer_base locks before tmigr @@ -1465,6 +1468,34 @@ static long tmigr_trigger_active(void *unused) return 0; } +static unsigned int tmigr_get_capacity(int cpu) +{ + /* + * nohz_full CPUs need to make sure there is always an available (online) + * and never idle migrator to handle all their global timers. That duty + * is served by the timekeeper which then never stops its tick. But the + * timekeeper must then belong to the same hierarchy as all the nohz_full + * CPUs. Simply turn off capacity awareness when nohz_full is running. 
+ */ + if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN)) + return SCHED_CAPACITY_SCALE; + else + return arch_scale_cpu_capacity(cpu); +} + +static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu) +{ + unsigned int capacity = tmigr_get_capacity(cpu); + struct tmigr_hierarchy *iter; + + list_for_each_entry(iter, &tmigr_hierarchy_list, node) { + if (iter->capacity == capacity) + return iter; + } + + return NULL; +} + static int tmigr_clear_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1489,8 +1520,21 @@ static int tmigr_clear_cpu_available(unsigned int cpu) } if (firstexp != KTIME_MAX) { - migrator = cpumask_any(tmigr_available_cpumask); - work_on_cpu(migrator, tmigr_trigger_active, NULL); + struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu); + + if (WARN_ON_ONCE(!hier)) + return -EINVAL; + + migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask); + if (migrator < nr_cpu_ids) { + work_on_cpu(migrator, tmigr_trigger_active, NULL); + } else { + /* + * If deactivation returned an expiration, it belongs to an available + * nohz CPU in the hierarchy. + */ + WARN_ONCE(1, "Expected available CPU in the hierarchy\n"); + } } return 0; @@ -1653,14 +1697,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, group->groupevt.ignore = true; } -static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) +static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; lockdep_assert_held(&tmigr_mutex); /* Try to attach to an existing group first */ - list_for_each_entry(tmp, &tmigr_level_list[lvl], list) { + list_for_each_entry(tmp, &hier->level_list[lvl], list) { /* * If @lvl is below the cross NUMA node level, check whether * this group belongs to the same NUMA node. 
@@ -1694,14 +1738,14 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) tmigr_init_group(group, lvl, node); /* Setup successful. Add it to the hierarchy */ - list_add(&group->list, &tmigr_level_list[lvl]); + list_add(&group->list, &hier->level_list[lvl]); trace_tmigr_group_set(group); return group; } -static bool tmigr_init_root(struct tmigr_group *group, bool activate) +static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate) { - if (!group->parent && group != tmigr_root) { + if (!group->parent && group != hier->root) { /* * This is the new top-level, prepare its groupmask in advance * to avoid accidents where yet another new top-level is @@ -1717,11 +1761,10 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate) } -static void tmigr_connect_child_parent(struct tmigr_group *child, - struct tmigr_group *parent, - bool activate) +static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child, + struct tmigr_group *parent, bool activate) { - if (tmigr_init_root(parent, activate)) { + if (tmigr_init_root(hier, parent, activate)) { /* * The previous top level had prepared its groupmask already, * simply account it in advance as the first child. 
If some groups @@ -1754,13 +1797,13 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, */ smp_store_release(&child->parent, parent); - trace_tmigr_connect_child_parent(child); + trace_tmigr_connect_child_parent(hier, child); } -static int tmigr_setup_groups(unsigned int cpu, unsigned int node, - struct tmigr_group *start, bool activate) +static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu, + unsigned int node, struct tmigr_group *start, bool activate) { - struct tmigr_group *group, *child, **stack; + struct tmigr_group *root = hier->root, *group, *child, **stack; int i, top = 0, err = 0, start_lvl = 0; bool root_mismatch = false; @@ -1773,11 +1816,11 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, start_lvl = start->level + 1; } - if (tmigr_root) - root_mismatch = tmigr_root->numa_node != node; + if (root) + root_mismatch = root->numa_node != node; for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { - group = tmigr_get_group(node, i); + group = tmigr_get_group(hier, node, i); if (IS_ERR(group)) { err = PTR_ERR(group); i--; @@ -1799,7 +1842,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, if (group->parent) break; if ((!root_mismatch || i >= tmigr_crossnode_level) && - list_is_singular(&tmigr_level_list[i])) + list_is_singular(&hier->level_list[i])) break; } @@ -1827,15 +1870,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); - tmigr_init_root(group, activate); + tmigr_init_root(hier, group, activate); - trace_tmigr_connect_cpu_parent(tmc); + trace_tmigr_connect_cpu_parent(hier, tmc); /* There are no children that need to be connected */ continue; } else { child = stack[i - 1]; - tmigr_connect_child_parent(child, group, activate); + tmigr_connect_child_parent(hier, child, group, activate); } } @@ -1860,31 +1903,54 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, * 
child to the new parents. So tmigr_active_up() activates the * new parents while walking up from the old root to the new. * - * * It is ensured that @start is active, as this setup path is - * executed in hotplug prepare callback. This is executed by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. + * * It is ensured that @start is active, (or on the way to be activated + * by another CPU that woke up before the current one) as this setup path + * is executed in hotplug prepare callback. This is executed by an already + * connected and !idle CPU in the hierarchy. + * + * * The below RmW atomic operation ensures that: + * + * 1) If the old root has been completely activated, the latest state is + * acquired (the below implicit acquire pairs with the implicit release + * from cmpxchg() in tmigr_active_up()). + * + * 2) If the old root is still on the way to be activated, the lagging behind + * CPU performing the activation will acquire the links up to the new root. + * (The below implicit release pairs with the implicit acquire from cmpxchg() + * in tmigr_active_up()). + * + * 3) Every subsequent CPU below the old root will acquire the new links while + * walking through the old root (The below implicit release pairs with the + * implicit acquire from cmpxchg() in either tmigr_active_up()) or + * tmigr_inactive_up(). 
*/ - state.state = atomic_read(&start->migr_state); - WARN_ON_ONCE(!state.active); + state.state = atomic_fetch_or(0, &start->migr_state); WARN_ON_ONCE(!start->parent); - data.childmask = start->groupmask; - __walk_groups_from(tmigr_active_up, &data, start, start->parent); + /* + * If the state of the old root is inactive, another CPU is on its way to activate + * it and propagate to the new root. + */ + if (state.active) { + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); + } + } else if (start) { + union tmigr_state state; + + /* Remote activation assumes the whole target's hierarchy is inactive */ + state.state = atomic_read(&start->migr_state); + WARN_ON_ONCE(state.active); } /* Root update */ - if (list_is_singular(&tmigr_level_list[top])) { - group = list_first_entry(&tmigr_level_list[top], - typeof(*group), list); + if (list_is_singular(&hier->level_list[top])) { + group = list_first_entry(&hier->level_list[top], typeof(*group), list); WARN_ON_ONCE(group->parent); - if (tmigr_root) { + if (root) { /* Old root should be the same or below */ - WARN_ON_ONCE(tmigr_root->level > top); + WARN_ON_ONCE(root->level > top); } - tmigr_root = group; + hier->root = group; } out: kfree(stack); @@ -1892,34 +1958,123 @@ out: return err; } +static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu) +{ + struct tmigr_hierarchy *hier; + + hier = __tmigr_get_hierarchy(cpu); + + if (hier) + return hier; + + hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels); + if (!hier) + return ERR_PTR(-ENOMEM); + + hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!hier->cpumask) { + kfree(hier); + return ERR_PTR(-ENOMEM); + } + + for (int i = 0; i < tmigr_hierarchy_levels; i++) + INIT_LIST_HEAD(&hier->level_list[i]); + + hier->capacity = tmigr_get_capacity(cpu); + list_add_tail(&hier->node, &tmigr_hierarchy_list); + + return hier; +} + +static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu, + 
struct tmigr_group *old_root, bool activate) +{ + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == smp_processor_id()); + if (activate) { + /* + * The current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available)); + } + + return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate); +} + +static long connect_old_root_work(void *arg) +{ + struct tmigr_group *old_root = arg; + struct tmigr_hierarchy *hier; + int cpu = smp_processor_id(); + + hier = __tmigr_get_hierarchy(cpu); + if (WARN_ON_ONCE(!hier)) + return -EINVAL; + + return tmigr_connect_old_root(hier, cpu, old_root, true); +} + static int tmigr_add_cpu(unsigned int cpu) { - struct tmigr_group *old_root = tmigr_root; + struct tmigr_hierarchy *hier; + struct tmigr_group *old_root; int node = cpu_to_node(cpu); int ret; guard(mutex)(&tmigr_mutex); - ret = tmigr_setup_groups(cpu, node, NULL, false); + hier = tmigr_get_hierarchy(cpu); + if (IS_ERR(hier)) + return PTR_ERR(hier); + + old_root = hier->root; + + ret = tmigr_setup_groups(hier, cpu, node, NULL, false); + + if (ret < 0) + return ret; /* Root has changed? Connect the old one to the new */ - if (ret >= 0 && old_root && old_root != tmigr_root) { - /* - * The target CPU must never do the prepare work, except - * on early boot when the boot CPU is the target. Otherwise - * it may spuriously activate the old top level group inside - * the new one (nevertheless whether old top level group is - * active or not) and/or release an uninitialized childmask. 
- */ - WARN_ON_ONCE(cpu == raw_smp_processor_id()); - /* - * The (likely) current CPU is expected to be online in the hierarchy, - * otherwise the old root may not be active as expected. - */ - WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); - ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + if (old_root && old_root != hier->root) { + guard(migrate)(); + + if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) { + /* + * If the target belongs to the same hierarchy, the old root is expected + * to be active. Link and propagate to the new root. + */ + ret = tmigr_connect_old_root(hier, cpu, old_root, true); + } else { + int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask); + + if (target < nr_cpu_ids) { + /* + * If the target doesn't belong to the same hierarchy as the current + * CPU, activate from a relevant one to make sure the old root is + * active. + */ + ret = work_on_cpu(target, connect_old_root_work, old_root); + } else { + /* + * No other available CPUs in the remote hierarchy. Link the + * old root remotely but don't propagate activation since the + * old root is not expected to be active.
+ */ + ret = tmigr_connect_old_root(hier, cpu, old_root, false); + } + } } + if (ret >= 0) + cpumask_set_cpu(cpu, hier->cpumask); + return ret; } @@ -1952,7 +2107,7 @@ static int tmigr_cpu_prepare(unsigned int cpu) static int __init tmigr_init(void) { - unsigned int cpulvl, nodelvl, cpus_per_node, i; + unsigned int cpulvl, nodelvl, cpus_per_node; unsigned int nnodes = num_possible_nodes(); unsigned int ncpus = num_possible_cpus(); int ret = -ENOMEM; @@ -1999,14 +2154,6 @@ static int __init tmigr_init(void) */ tmigr_crossnode_level = cpulvl; - tmigr_level_list = kzalloc_objs(struct list_head, - tmigr_hierarchy_levels); - if (!tmigr_level_list) - goto err; - - for (i = 0; i < tmigr_hierarchy_levels; i++) - INIT_LIST_HEAD(&tmigr_level_list[i]); - pr_info("Timer migration: %d hierarchy levels; %d children per group;" " %d crossnode level\n", tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 70879cde6fdd..31735dd52327 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -6,6 +6,24 @@ #define TMIGR_CHILDREN_PER_GROUP 8 /** + * struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity. + * Homogeneous systems have only one hierarchy. + * Heterogeneous systems have one hierarchy per CPU capacity.
+ * @cpumask: CPUs belonging to this hierarchy + * @root: The current root of the hierarchy + * @capacity: CPU capacity associated to this hierarchy + * @node: Node in the global hierarchy list + * @level_list: Per level lists of tmigr groups + */ +struct tmigr_hierarchy { + struct cpumask *cpumask; + struct tmigr_group *root; + unsigned long capacity; + struct list_head node; + struct list_head level_list[]; +}; + +/** * struct tmigr_event - a timer event associated to a CPU * @nextevt: The node to enqueue an event in the parent group queue * @cpu: The CPU to which this event belongs @@ -75,15 +93,17 @@ struct tmigr_group { /** * struct tmigr_cpu - timer migration per CPU group * @lock: Lock protecting the tmigr_cpu group information - * @online: Indicates whether the CPU is online; In deactivate path - * it is required to know whether the migrator in the top - * level group is to be set offline, while a timer is - * pending. Then another online CPU needs to be notified to - * take over the migrator role. Furthermore the information - * is required in CPU hotplug path as the CPU is able to go - * idle before the timer migration hierarchy hotplug AP is - * reached. During this phase, the CPU has to handle the + * @available: Indicates whether the CPU is available for handling + * global timers. In the deactivate path it is required to + * know whether the migrator in the top level group is to + * be set offline, while a timer is pending. Then another + * available CPU needs to be notified to take over the + * migrator role. Furthermore the information is required + * in the CPU hotplug path as the CPU is able to go idle + * before the timer migration hierarchy hotplug callback is + * reached. During this phase, the CPU has to handle the * global timers on its own and must not act as a migrator. 
+ * @idle: Indicates whether the CPU is idle in the timer migration * hierarchy * @remote: Is set when timers of the CPU are expired remotely diff --git a/kernel/torture.c b/kernel/torture.c index 62c1ac777694..77cb3589b19f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -972,3 +972,19 @@ void _torture_stop_kthread(char *m, struct task_struct **tp) *tp = NULL; } EXPORT_SYMBOL_GPL(_torture_stop_kthread); + +/* + * Set the specified task's niceness value, saturating at limits. + * Saturating noisily, but saturating. + */ +void torture_sched_set_normal(struct task_struct *t, int nice) +{ + int realnice = nice; + + if (WARN_ON_ONCE(realnice > MAX_NICE)) + realnice = MAX_NICE; + if (WARN_ON_ONCE(realnice < MIN_NICE)) + realnice = MIN_NICE; + sched_set_normal(t, realnice); +} +EXPORT_SYMBOL_GPL(torture_sched_set_normal); diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 1decdce8cbef..8d3d96e847d8 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -143,8 +143,8 @@ obj-$(CONFIG_TRACE_REMOTE_TEST) += remote_test.o targets += undefsyms_base.o KASAN_SANITIZE_undefsyms_base.o := y -UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __x86_indirect_thunk \ - __msan simple_ring_buffer \ +UNDEFINED_ALLOWLIST = __asan __gcov __kasan __kcsan __hwasan __sancov __sanitizer __tsan __ubsan __msan \ + __aeabi_unwind_cpp __s390_indirect_jump __x86_indirect_thunk simple_ring_buffer \ $(shell $(NM) -u $(obj)/undefsyms_base.o 2>/dev/null | awk '{print $$2}') quiet_cmd_check_undefined = NM $< @@ -154,7 +154,8 @@ quiet_cmd_check_undefined = NM $< echo "Unexpected symbols in $<:" >&2; \ echo "$$undefsyms" >&2; \ false; \ - fi + fi; \ + touch $@ $(obj)/%.o.checked: $(obj)/%.o $(obj)/undefsyms_base.o FORCE $(call if_changed,check_undefined) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index af7079aa0f36..82f8feea6931 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -23,6 
+23,7 @@ #include <linux/sort.h> #include <linux/key.h> #include <linux/namei.h> +#include <linux/file.h> #include <net/bpf_sk_storage.h> @@ -42,6 +43,7 @@ #define MAX_UPROBE_MULTI_CNT (1U << 20) #define MAX_KPROBE_MULTI_CNT (1U << 20) +#define MAX_TRACING_MULTI_CNT (1U << 20) #ifdef CONFIG_MODULES struct bpf_trace_module { @@ -152,6 +154,34 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) return ret; } +/** + * trace_call_bpf_faultable - invoke BPF program in faultable context + * @call: tracepoint event + * @ctx: opaque context pointer + * + * Variant of trace_call_bpf() for faultable tracepoints (syscall + * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace + * for lifetime protection and bpf_prog_run_array_sleepable() for per-program + * RCU flavor selection, following the uprobe pattern. + * + * Per-program recursion protection is provided by + * bpf_prog_run_array_sleepable(). Global bpf_prog_active is not + * needed because syscall tracepoints cannot self-recurse. + * + * Must be called from a faultable/preemptible context. 
+ */ +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx) +{ + struct bpf_prog_array *prog_array; + + might_fault(); + guard(rcu_tasks_trace)(); + + prog_array = rcu_dereference_check(call->prog_array, + rcu_read_lock_trace_held()); + return bpf_prog_run_array_sleepable(prog_array, ctx, bpf_prog_run); +} + #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { @@ -1305,7 +1335,8 @@ static inline bool is_uprobe_session(const struct bpf_prog *prog) static inline bool is_trace_fsession(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_TRACING && - prog->expected_attach_type == BPF_TRACE_FSESSION; + (prog->expected_attach_type == BPF_TRACE_FSESSION || + prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI); } static const struct bpf_func_proto * @@ -2072,11 +2103,19 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) static __always_inline void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) { + struct srcu_ctr __percpu *scp = NULL; struct bpf_prog *prog = link->link.prog; + bool sleepable = prog->sleepable; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; - rcu_read_lock_dont_migrate(); + if (sleepable) { + scp = rcu_read_lock_tasks_trace(); + migrate_disable(); + } else { + rcu_read_lock_dont_migrate(); + } + if (unlikely(!bpf_prog_get_recursion_context(prog))) { bpf_prog_inc_misses_counter(prog); goto out; @@ -2085,12 +2124,18 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) run_ctx.bpf_cookie = link->cookie; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); - (void) bpf_prog_run(prog, args); + (void)bpf_prog_run(prog, args); bpf_reset_run_ctx(old_run_ctx); out: bpf_prog_put_recursion_context(prog); - rcu_read_unlock_migrate(); + + if (sleepable) { + migrate_enable(); + rcu_read_unlock_tasks_trace(scp); + } else { + rcu_read_unlock_migrate(); + } } #define UNPACK(...) 
__VA_ARGS__ @@ -2384,7 +2429,8 @@ static void bpf_kprobe_multi_link_release(struct bpf_link *link) struct bpf_kprobe_multi_link *kmulti_link; kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); - unregister_fprobe(&kmulti_link->fp); + /* Don't wait for RCU GP here. */ + unregister_fprobe_async(&kmulti_link->fp); kprobe_multi_put_modules(kmulti_link->mods, kmulti_link->mods_cnt); } @@ -3169,6 +3215,38 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx) return run_ctx->uprobe->cookie; } +static int bpf_uprobe_multi_get_path(const union bpf_attr *attr, struct path *path) +{ + void __user *upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); + u32 path_fd = attr->link_create.uprobe_multi.path_fd; + u32 flags = attr->link_create.uprobe_multi.flags; + + if (flags & BPF_F_UPROBE_MULTI_PATH_FD) { + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is set, the executable is + * identified by path_fd, upath must be NULL. + */ + if (upath) + return -EINVAL; + + CLASS(fd, f)(path_fd); + if (fd_empty(f)) + return -EBADF; + *path = fd_file(f)->f_path; + path_get(path); + return 0; + } + + /* + * When BPF_F_UPROBE_MULTI_PATH_FD is not set, the path is resolved + * relative to the cwd (AT_FDCWD) or absolute using the upath string. 
+ */ + if (!upath || path_fd) + return -EINVAL; + + return user_path_at(AT_FDCWD, upath, LOOKUP_FOLLOW, path); +} + int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_uprobe_multi_link *link = NULL; @@ -3178,10 +3256,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr struct task_struct *task = NULL; unsigned long __user *uoffsets; u64 __user *ucookies; - void __user *upath; + unsigned long size; u32 flags, cnt, i; struct path path; - char *name; pid_t pid; int err; @@ -3196,19 +3273,18 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr return -EINVAL; flags = attr->link_create.uprobe_multi.flags; - if (flags & ~BPF_F_UPROBE_MULTI_RETURN) + if (flags & ~(BPF_F_UPROBE_MULTI_RETURN | BPF_F_UPROBE_MULTI_PATH_FD)) return -EINVAL; /* - * path, offsets and cnt are mandatory, + * offsets and cnt are mandatory, * ref_ctr_offsets and cookies are optional */ - upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); cnt = attr->link_create.uprobe_multi.cnt; pid = attr->link_create.uprobe_multi.pid; - if (!upath || !uoffsets || !cnt || pid < 0) + if (!uoffsets || !cnt || pid < 0) return -EINVAL; if (cnt > MAX_UPROBE_MULTI_CNT) return -E2BIG; @@ -3216,14 +3292,17 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets); ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies); - name = strndup_user(upath, PATH_MAX); - if (IS_ERR(name)) { - err = PTR_ERR(name); - return err; - } + /* + * All uoffsets/uref_ctr_offsets/ucookies arrays have the same value + * size, we need to check their address range is safe for __get_user + * calls. 
+ */ + size = sizeof(*uoffsets) * cnt; + if (!access_ok(uoffsets, size) || !access_ok(uref_ctr_offsets, size) || + !access_ok(ucookies, size)) + return -EFAULT; - err = kern_path(name, LOOKUP_FOLLOW, &path); - kfree(name); + err = bpf_uprobe_multi_get_path(attr, &path); if (err) return err; @@ -3397,12 +3476,12 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc * direct calls into all the specific callback implementations * (copy_user_data_sleepable, copy_user_data_nofault, and so on) */ -static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size, +static __always_inline int __bpf_dynptr_copy_str(const struct bpf_dynptr *dptr, u64 doff, u64 size, const void *unsafe_src, copy_fn_t str_copy_fn, struct task_struct *tsk) { - struct bpf_dynptr_kern *dst; + const struct bpf_dynptr_kern *dst; u64 chunk_sz, off; void *dst_slice; int cnt, err; @@ -3438,7 +3517,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 u64 size, const void *unsafe_src, copy_fn_t copy_fn, struct task_struct *tsk) { - struct bpf_dynptr_kern *dst; + const struct bpf_dynptr_kern *dst; void *dst_slice; char buf[256]; u64 off, chunk_sz; @@ -3539,49 +3618,49 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid return bpf_send_signal_common(sig, type, task, value); } -__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_user_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_kernel_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign, copy_kernel_data_nofault, NULL); 
} -__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_nofault, NULL); } -__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign, copy_kernel_str_nofault, NULL); } -__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_data_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign) { return __bpf_dynptr_copy_str(dptr, off, size, (const void __force *)unsafe_ptr__ign, copy_user_str_sleepable, NULL); } -__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_task_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { @@ -3589,7 +3668,7 @@ __bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off, copy_user_data_sleepable, tsk); } -__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off, +__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(const struct bpf_dynptr *dptr, u64 off, u64 size, const void __user *unsafe_ptr__ign, struct task_struct *tsk) { @@ -3598,3 +3677,203 @@ __bpf_kfunc int 
bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 } __bpf_kfunc_end_defs(); + +#if defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS) && \ + defined(CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS) + +static void bpf_tracing_multi_link_release(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + + WARN_ON_ONCE(bpf_trampoline_multi_detach(link->prog, tr_link)); +} + +static void bpf_tracing_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + + kvfree(tr_link->fexits); + kvfree(tr_link->cookies); + kvfree(tr_link); +} + +#ifdef CONFIG_PROC_FS +static void bpf_tracing_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_tracing_multi_link *tr_link = + container_of(link, struct bpf_tracing_multi_link, link); + bool has_cookies = !!tr_link->cookies; + + seq_printf(seq, "attach_type:\t%u\n", tr_link->link.attach_type); + seq_printf(seq, "cnt:\t%u\n", tr_link->nodes_cnt); + + seq_printf(seq, "%s\t %s\t %s\t %s\n", "obj-id", "btf-id", "cookie", "func"); + for (int i = 0; i < tr_link->nodes_cnt; i++) { + struct bpf_tracing_multi_node *mnode = &tr_link->nodes[i]; + u32 btf_id, obj_id; + + bpf_trampoline_unpack_key(mnode->trampoline->key, &obj_id, &btf_id); + seq_printf(seq, "%u\t %u\t %llu\t %pS\n", + obj_id, btf_id, + has_cookies ? 
tr_link->cookies[i] : 0, + (void *) mnode->trampoline->ip); + + cond_resched(); + } +} +#endif + +static const struct bpf_link_ops bpf_tracing_multi_link_lops = { + .release = bpf_tracing_multi_link_release, + .dealloc_deferred = bpf_tracing_multi_link_dealloc, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_tracing_multi_show_fdinfo, +#endif +}; + +static int ids_cmp_r(const void *pa, const void *pb, const void *priv __maybe_unused) +{ + u32 a = *(u32 *) pa; + u32 b = *(u32 *) pb; + + return (a > b) - (a < b); +} + +static void ids_swap_r(void *a, void *b, int size __maybe_unused, + const void *priv __maybe_unused) +{ + u64 *cookie_a, *cookie_b, *cookies; + u32 *id_a = a, *id_b = b, *ids; + void **data = (void **) priv; + + ids = data[0]; + cookies = data[1]; + + if (cookies) { + cookie_a = cookies + (id_a - ids); + cookie_b = cookies + (id_b - ids); + swap(*cookie_a, *cookie_b); + } + swap(*id_a, *id_b); +} + +static int check_dup_ids(u32 *ids, u64 *cookies, u32 cnt) +{ + void *data[2] = { ids, cookies }; + int err = 0; + + /* + * Sort ids array (together with cookies array if defined) + * and check it for duplicates. The ids and cookies arrays + * are left sorted. 
+ */ + sort_r_nonatomic(ids, cnt, sizeof(ids[0]), ids_cmp_r, ids_swap_r, data); + + for (int i = 1; i < cnt; i++) { + if (ids[i] == ids[i - 1]) { + err = -EINVAL; + break; + } + } + return err; +} + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + struct bpf_tracing_multi_link *link = NULL; + struct bpf_tramp_node *fexits = NULL; + struct bpf_link_primer link_primer; + u32 cnt, *ids = NULL; + u64 __user *ucookies; + u64 *cookies = NULL; + u32 __user *uids; + int err; + + uids = u64_to_user_ptr(attr->link_create.tracing_multi.ids); + cnt = attr->link_create.tracing_multi.cnt; + + if (!cnt || !uids) + return -EINVAL; + if (cnt > MAX_TRACING_MULTI_CNT) + return -E2BIG; + if (attr->link_create.flags || attr->link_create.target_fd) + return -EINVAL; + + ids = kvmalloc_objs(*ids, cnt); + if (!ids) + return -ENOMEM; + + if (copy_from_user(ids, uids, cnt * sizeof(*ids))) { + err = -EFAULT; + goto error; + } + + ucookies = u64_to_user_ptr(attr->link_create.tracing_multi.cookies); + if (ucookies) { + cookies = kvmalloc_objs(*cookies, cnt); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, cnt * sizeof(*cookies))) { + err = -EFAULT; + goto error; + } + } + + err = check_dup_ids(ids, cookies, cnt); + if (err) + goto error; + + if (prog->expected_attach_type == BPF_TRACE_FSESSION_MULTI) { + fexits = kvmalloc_objs(*fexits, cnt); + if (!fexits) { + err = -ENOMEM; + goto error; + } + } + + link = kvzalloc_flex(*link, nodes, cnt); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_TRACING_MULTI, + &bpf_tracing_multi_link_lops, prog, prog->expected_attach_type); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error; + + link->nodes_cnt = cnt; + link->cookies = cookies; + link->fexits = fexits; + + err = bpf_trampoline_multi_attach(prog, ids, link); + kvfree(ids); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + return 
bpf_link_settle(&link_primer); + +error: + kvfree(fexits); + kvfree(cookies); + kvfree(ids); + kvfree(link); + return err; +} + +#else + +int bpf_tracing_multi_attach(struct bpf_prog *prog, const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS && CONFIG_HAVE_SINGLE_FTRACE_DIRECT_OPS */ diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index cc49ebd2a773..f378613ad120 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -1093,14 +1093,15 @@ static int unregister_fprobe_nolock(struct fprobe *fp) } /** - * unregister_fprobe() - Unregister fprobe. + * unregister_fprobe_async() - Unregister fprobe without RCU GP wait * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will NOT wait until the fprobe is no longer used. * * Return 0 if @fp is unregistered successfully, -errno if not. */ -int unregister_fprobe(struct fprobe *fp) +int unregister_fprobe_async(struct fprobe *fp) { guard(mutex)(&fprobe_mutex); if (!fp || !fprobe_registered(fp)) @@ -1108,6 +1109,24 @@ int unregister_fprobe(struct fprobe *fp) return unregister_fprobe_nolock(fp); } + +/** + * unregister_fprobe() - Unregister fprobe with RCU GP wait + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * This function will block until the fprobe is no longer used. + * + * Return 0 if @fp is unregistered successfully, -errno if not. 
+ */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret = unregister_fprobe_async(fp); + + if (!ret) + synchronize_rcu(); + return ret; +} EXPORT_SYMBOL_GPL(unregister_fprobe); static int __init fprobe_initcall(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b2611de3f594..f93e34dd2328 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1198,8 +1198,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) return __ftrace_lookup_ip(hash, ip); } -static void __add_hash_entry(struct ftrace_hash *hash, - struct ftrace_func_entry *entry) +void add_ftrace_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { struct hlist_head *hhd; unsigned long key; @@ -1221,7 +1220,7 @@ add_ftrace_hash_entry_direct(struct ftrace_hash *hash, unsigned long ip, unsigne entry->ip = ip; entry->direct = direct; - __add_hash_entry(hash, entry); + add_ftrace_hash_entry(hash, entry); return entry; } @@ -1249,6 +1248,25 @@ remove_hash_entry(struct ftrace_hash *hash, hash->count--; } +void ftrace_hash_remove(struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + struct hlist_node *tn; + int size; + int i; + + if (!hash || !hash->count) + return; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tn, hhd, hlist) + remove_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + static void ftrace_hash_clear(struct ftrace_hash *hash) { struct hlist_head *hhd; @@ -1458,7 +1476,7 @@ static struct ftrace_hash *__move_hash(struct ftrace_hash *src, int size) hhd = &src->buckets[i]; hlist_for_each_entry_safe(entry, tn, hhd, hlist) { remove_hash_entry(src, entry); - __add_hash_entry(new_hash, entry); + add_ftrace_hash_entry(new_hash, entry); } } return new_hash; @@ -5341,7 +5359,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, map->entry.ip = ip; map->data = data; - __add_hash_entry(&mapper->hash, 
&map->entry); + add_ftrace_hash_entry(&mapper->hash, &map->entry); return 0; } @@ -6288,11 +6306,16 @@ int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) } EXPORT_SYMBOL_GPL(modify_ftrace_direct); -static unsigned long hash_count(struct ftrace_hash *hash) +static inline unsigned long hash_count(struct ftrace_hash *hash) { return hash ? hash->count : 0; } +unsigned long ftrace_hash_count(struct ftrace_hash *hash) +{ + return hash_count(hash); +} + /** * hash_add - adds two struct ftrace_hash and returns the result * @a: struct ftrace_hash object diff --git a/kernel/trace/remote_test.c b/kernel/trace/remote_test.c index 6c1b7701ddae..a3e2c9b606eb 100644 --- a/kernel/trace/remote_test.c +++ b/kernel/trace/remote_test.c @@ -110,9 +110,9 @@ static struct trace_buffer_desc *remote_test_load(unsigned long size, void *unus return remote_test_buffer_desc; err_unload: - for_each_ring_buffer_desc(rb_desc, cpu, remote_test_buffer_desc) + for_each_ring_buffer_desc(rb_desc, cpu, desc) remote_test_unload_simple_rb(rb_desc->cpu); - trace_remote_free_buffer(remote_test_buffer_desc); + trace_remote_free_buffer(desc); err_free_desc: kfree(desc); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5326924615a4..ebae64ec2f11 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7,6 +7,7 @@ #include <linux/ring_buffer_types.h> #include <linux/sched/isolation.h> #include <linux/trace_recursion.h> +#include <linux/panic_notifier.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> @@ -31,6 +32,7 @@ #include <linux/oom.h> #include <linux/mm.h> +#include <asm/ring_buffer.h> #include <asm/local64.h> #include <asm/local.h> #include <asm/setup.h> @@ -559,6 +561,7 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + struct notifier_block flush_nb; struct ring_buffer_meta *meta; @@ -2521,6 +2524,16 @@ static void rb_free_cpu_buffer(struct 
ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } +/* Stop recording on a persistent buffer and flush cache if needed. */ +static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data) +{ + struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb); + + ring_buffer_record_off(buffer); + arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end); + return NOTIFY_DONE; +} + static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, @@ -2651,6 +2664,12 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); + /* Persistent ring buffer needs to flush cache before reboot. */ + if (start && end) { + buffer->flush_nb.notifier_call = rb_flush_buffer_cb; + atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb); + } + return_ptr(buffer); fail_free_buffers: @@ -2749,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer) { int cpu; + if (buffer->range_addr_start && buffer->range_addr_end) + atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb); + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); @@ -3769,13 +3791,6 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, return skip_time_extend(event); } -#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static inline bool sched_clock_stable(void) -{ - return true; -} -#endif - static void rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -5407,6 +5422,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; + iter->missed_events = 0; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; @@ -6086,10 +6102,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int 
cpu, u64 *ts, */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { - bool ret = iter->missed_events != 0; - - iter->missed_events = 0; - return ret; + return iter->missed_events != 0; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); @@ -6251,7 +6264,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - + iter->missed_events = 0; rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/rv/monitors/deadline/deadline.h b/kernel/trace/rv/monitors/deadline/deadline.h index 0bbfd2543329..78fca873d61e 100644 --- a/kernel/trace/rv/monitors/deadline/deadline.h +++ b/kernel/trace/rv/monitors/deadline/deadline.h @@ -95,7 +95,8 @@ static inline u8 get_server_type(struct task_struct *tsk) static inline int extract_params(struct pt_regs *regs, long id, pid_t *pid_out) { size_t size = offsetofend(struct sched_attr, sched_flags); - struct sched_attr __user *uattr, attr; + struct sched_attr __user *uattr; + struct sched_attr attr; int new_policy = -1, ret; unsigned long args[6]; diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c index 31f90f3638d8..8ead8783c29f 100644 --- a/kernel/trace/rv/monitors/nomiss/nomiss.c +++ b/kernel/trace/rv/monitors/nomiss/nomiss.c @@ -227,7 +227,7 @@ static int enable_nomiss(void) { int retval; - retval = da_monitor_init(); + retval = ha_monitor_init(); if (retval) return retval; @@ -263,7 +263,7 @@ static void disable_nomiss(void) rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch); rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup); - da_monitor_destroy(); + ha_monitor_destroy(); } static struct rv_monitor rv_this = { diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c index 4594c7c46601..3b6a85e815b8 100644 --- a/kernel/trace/rv/monitors/opid/opid.c +++ 
b/kernel/trace/rv/monitors/opid/opid.c @@ -22,14 +22,8 @@ static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs_opid env, u64 time_ns if (env == irq_off_opid) return irqs_disabled(); else if (env == preempt_off_opid) { - /* - * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables - * preemption (adding one to the preempt_count). Since we are - * interested in the preempt_count at the time the tracepoint was - * hit, we consider 1 as still enabled. - */ if (IS_ENABLED(CONFIG_PREEMPTION)) - return (preempt_count() & PREEMPT_MASK) > 1; + return (preempt_count() & PREEMPT_MASK) > 0; return true; } return ENV_INVALID_VALUE; @@ -73,7 +67,7 @@ static int enable_opid(void) { int retval; - retval = da_monitor_init(); + retval = ha_monitor_init(); if (retval) return retval; @@ -90,7 +84,7 @@ static void disable_opid(void) rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); - da_monitor_destroy(); + ha_monitor_destroy(); } /* diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c index 9ccfda6b0e73..3c38fb1a0159 100644 --- a/kernel/trace/rv/monitors/stall/stall.c +++ b/kernel/trace/rv/monitors/stall/stall.c @@ -103,7 +103,7 @@ static int enable_stall(void) { int retval; - retval = da_monitor_init(); + retval = ha_monitor_init(); if (retval) return retval; @@ -120,7 +120,7 @@ static void disable_stall(void) rv_detach_trace_probe("stall", sched_switch, handle_sched_switch); rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup); - da_monitor_destroy(); + ha_monitor_destroy(); } static struct rv_monitor rv_this = { diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c index 02af2297ae5a..f4642f5adda3 100644 --- a/kernel/trace/simple_ring_buffer.c +++ b/kernel/trace/simple_ring_buffer.c @@ -395,7 +395,6 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, 
memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); cpu_buffer->meta->meta_page_size = PAGE_SIZE; - cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; /* The reader page is not part of the ring initially */ page = load_page(desc->page_va[0]); @@ -431,12 +430,13 @@ int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, if (ret) { for (i--; i >= 0; i--) - unload_page((void *)desc->page_va[i]); + unload_page(bpages[i].page); unload_page(cpu_buffer->meta); return ret; } + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; /* Close the ring */ bpage->link.next = &cpu_buffer->tail_page->link; cpu_buffer->tail_page->link.prev = &bpage->link; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0dbbf6cca9bc..eb2c2bc8bc3d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1369,10 +1369,8 @@ static const char *hist_field_name(struct hist_field *field, len = snprintf(full_name, sizeof(full_name), fmt, field->system, field->event_name, field->name); - if (len >= sizeof(full_name)) - return NULL; - - field_name = full_name; + if (len < sizeof(full_name)) + field_name = full_name; } else field_name = field->name; } else if (field->flags & HIST_FIELD_FL_TIMESTAMP) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 75678053b21c..5e83c4f6f2b4 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -83,6 +83,22 @@ struct osnoise_instance { static struct list_head osnoise_instances; +static void osnoise_print(const char *fmt, ...) 
+{ + struct osnoise_instance *inst; + struct trace_array *tr; + va_list ap; + + rcu_read_lock(); + list_for_each_entry_rcu(inst, &osnoise_instances, list) { + tr = inst->tr; + va_start(ap, fmt); + trace_array_vprintk(tr, _RET_IP_, fmt, ap); + va_end(ap); + } + rcu_read_unlock(); +} + static bool osnoise_has_registered_instances(void) { return !!list_first_or_null_rcu(&osnoise_instances, @@ -123,6 +139,7 @@ static int osnoise_register_instance(struct trace_array *tr) * trace_types_lock. */ lockdep_assert_held(&trace_types_lock); + trace_array_init_printk(tr); inst = kmalloc_obj(*inst); if (!inst) @@ -471,15 +488,7 @@ static void print_osnoise_headers(struct seq_file *s) * osnoise_taint - report an osnoise error. */ #define osnoise_taint(msg) ({ \ - struct osnoise_instance *inst; \ - struct trace_buffer *buffer; \ - \ - rcu_read_lock(); \ - list_for_each_entry_rcu(inst, &osnoise_instances, list) { \ - buffer = inst->tr->array_buffer.buffer; \ - trace_array_printk_buf(buffer, _THIS_IP_, msg); \ - } \ - rcu_read_unlock(); \ + osnoise_print(msg); \ osnoise_data.tainted = true; \ }) @@ -1189,10 +1198,10 @@ static __always_inline void osnoise_stop_exception(char *msg, int cpu) rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { tr = inst->tr; - trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, - "stop tracing hit on cpu %d due to exception: %s\n", - smp_processor_id(), - msg); + trace_array_printk(tr, _THIS_IP_, + "stop tracing hit on cpu %d due to exception: %s\n", + smp_processor_id(), + msg); if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) panic("tracer hit on cpu %d due to exception: %s\n", @@ -1362,8 +1371,8 @@ static __always_inline void osnoise_stop_tracing(void) rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { tr = inst->tr; - trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, - "stop tracing hit on cpu %d\n", smp_processor_id()); + trace_array_printk(tr, _THIS_IP_, + "stop tracing hit on cpu 
%d\n", smp_processor_id()); if (test_bit(OSN_PANIC_ON_STOP, &osnoise_options)) panic("tracer hit stop condition on CPU %d\n", smp_processor_id()); @@ -2544,9 +2553,12 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, notify_new_max_latency(diff); tlat->tracing_thread = false; - if (osnoise_data.stop_tracing_total) - if (time_to_us(diff) >= osnoise_data.stop_tracing_total) + if (osnoise_data.stop_tracing_total) { + if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { + timerlat_dump_stack(time_to_us(diff)); osnoise_stop_tracing(); + } + } } else { tlat->tracing_thread = false; tlat->kthread = current; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index e0d3a0da26af..fd1caa1f9723 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -332,6 +332,23 @@ static int parse_trace_event_arg(char *arg, struct fetch_insn *code, return -ENOENT; } +static int parse_trace_event(char *arg, struct fetch_insn *code, + struct traceprobe_parse_context *ctx) +{ + int ret; + + if (code->data) + return -EFAULT; + ret = parse_trace_event_arg(arg, code, ctx); + if (!ret) + return 0; + if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { + code->op = FETCH_OP_COMM; + return 0; + } + return -EINVAL; +} + #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS static u32 btf_type_int(const struct btf_type *t) @@ -376,11 +393,16 @@ static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type) && BTF_INT_BITS(intdata) == 8; } +static struct btf *ctx_btf(struct traceprobe_parse_context *ctx) +{ + return ctx->struct_btf ? 
: ctx->btf; +} + static int check_prepare_btf_string_fetch(char *typename, struct fetch_insn **pcode, struct traceprobe_parse_context *ctx) { - struct btf *btf = ctx->btf; + struct btf *btf = ctx_btf(ctx); if (!btf || !ctx->last_type) return 0; @@ -506,6 +528,15 @@ static int query_btf_context(struct traceprobe_parse_context *ctx) return 0; } +static void clear_struct_btf(struct traceprobe_parse_context *ctx) +{ + if (ctx->struct_btf) { + btf_put(ctx->struct_btf); + ctx->struct_btf = NULL; + ctx->last_struct = NULL; + } +} + static void clear_btf_context(struct traceprobe_parse_context *ctx) { if (ctx->btf) { @@ -554,22 +585,29 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, struct fetch_insn *code = *pcode; const struct btf_member *field; u32 bitoffs, anon_offs; + bool is_struct = ctx->struct_btf != NULL; + struct btf *btf = ctx_btf(ctx); char *next; int is_ptr; s32 tid; do { - /* Outer loop for solving arrow operator ('->') */ - if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) { - trace_probe_log_err(ctx->offset, NO_PTR_STRCT); - return -EINVAL; - } - /* Convert a struct pointer type to a struct type */ - type = btf_type_skip_modifiers(ctx->btf, type->type, &tid); - if (!type) { - trace_probe_log_err(ctx->offset, BAD_BTF_TID); - return -EINVAL; + if (!is_struct) { + /* Outer loop for solving arrow operator ('->') */ + if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) { + trace_probe_log_err(ctx->offset, NO_PTR_STRCT); + return -EINVAL; + } + + /* Convert a struct pointer type to a struct type */ + type = btf_type_skip_modifiers(btf, type->type, &tid); + if (!type) { + trace_probe_log_err(ctx->offset, BAD_BTF_TID); + return -EINVAL; + } } + /* Only the first type can skip being a pointer */ + is_struct = false; bitoffs = 0; do { @@ -580,7 +618,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, return is_ptr; anon_offs = 0; - field = btf_find_struct_member(ctx->btf, type, fieldname, + field = 
btf_find_struct_member(btf, type, fieldname, &anon_offs); if (IS_ERR(field)) { trace_probe_log_err(ctx->offset, BAD_BTF_TID); @@ -602,7 +640,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type, ctx->last_bitsize = 0; } - type = btf_type_skip_modifiers(ctx->btf, field->type, &tid); + type = btf_type_skip_modifiers(btf, field->type, &tid); if (!type) { trace_probe_log_err(ctx->offset, BAD_BTF_TID); return -EINVAL; @@ -640,7 +678,7 @@ static int parse_btf_arg(char *varname, int i, is_ptr, ret; u32 tid; - if (WARN_ON_ONCE(!ctx->funcname)) + if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_TEVENT))) return -EINVAL; is_ptr = split_next_field(varname, &field, ctx); @@ -653,6 +691,19 @@ static int parse_btf_arg(char *varname, return -EOPNOTSUPP; } + if (ctx->flags & TPARG_FL_TEVENT) { + ret = parse_trace_event(varname, code, ctx); + if (ret < 0) { + trace_probe_log_err(ctx->offset, BAD_ATTACH_ARG); + return ret; + } + /* TEVENT is only here via a typecast */ + if (WARN_ON_ONCE(ctx->struct_btf == NULL)) + return -EINVAL; + type = ctx->last_struct; + goto found_type; + } + if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) { code->op = FETCH_OP_RETVAL; /* Check whether the function return type is not void */ @@ -709,6 +760,7 @@ static int parse_btf_arg(char *varname, found: type = btf_type_skip_modifiers(ctx->btf, tid, &tid); +found_type: if (!type) { trace_probe_log_err(ctx->offset, BAD_BTF_TID); return -EINVAL; @@ -727,7 +779,7 @@ found: static const struct fetch_type *find_fetch_type_from_btf_type( struct traceprobe_parse_context *ctx) { - struct btf *btf = ctx->btf; + struct btf *btf = ctx_btf(ctx); const char *typestr = NULL; if (btf && ctx->last_type) @@ -758,7 +810,67 @@ static int parse_btf_bitfield(struct fetch_insn **pcode, return 0; } -#else +static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx) +{ + struct btf *btf = NULL; + int id; + + /* A struct_btf should only be used by a single 
argument */ + if (WARN_ON_ONCE(ctx->struct_btf)) { + btf_put(ctx->struct_btf); + ctx->struct_btf = NULL; + } + + id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf); + if (id < 0) + return id; + ctx->struct_btf = btf; + ctx->last_struct = btf_type_by_id(ctx->struct_btf, id); + return 0; +} + +static int handle_typecast(char *arg, struct fetch_insn **pcode, + struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + char *tmp; + int ret; + + /* Currently this only works for eprobes */ + if (!(ctx->flags & TPARG_FL_TEVENT)) { + trace_probe_log_err(ctx->offset, TYPECAST_NOT_EVENT); + return -EINVAL; + } + + tmp = strchr(arg, ')'); + if (!tmp) { + trace_probe_log_err(ctx->offset + strlen(arg), + DEREF_OPEN_BRACE); + return -EINVAL; + } + *tmp = '\0'; + ret = query_btf_struct(arg + 1, ctx); + *tmp = ')'; + + if (ret < 0) { + trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT); + return -EINVAL; + } + + tmp++; + + ctx->offset += tmp - arg; + ret = parse_btf_arg(tmp, pcode, end, ctx); + return ret; +} + +#else /* !CONFIG_PROBE_EVENTS_BTF_ARGS */ + +static void clear_struct_btf(struct traceprobe_parse_context *ctx) +{ + ctx->struct_btf = NULL; +} + static void clear_btf_context(struct traceprobe_parse_context *ctx) { ctx->btf = NULL; @@ -794,7 +906,15 @@ static int check_prepare_btf_string_fetch(char *typename, return 0; } -#endif +static int handle_typecast(char *arg, struct fetch_insn **pcode, + struct fetch_insn *end, + struct traceprobe_parse_context *ctx) +{ + trace_probe_log_err(ctx->offset, NOSUP_BTFARG); + return -EOPNOTSUPP; +} + +#endif /* CONFIG_PROBE_EVENTS_BTF_ARGS */ #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API @@ -838,15 +958,10 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) int i, offset, last_offset = 0; if (!earg) { - earg = kzalloc_obj(*tp->entry_arg); + earg = kzalloc_flex(*earg, code, 2 * tp->nr_args + 1); if (!earg) return -ENOMEM; earg->size = 2 * tp->nr_args + 1; - earg->code = kzalloc_objs(struct fetch_insn, earg->size); 
- if (!earg->code) { - kfree(earg); - return -ENOMEM; - } /* Fill the code buffer with 'end' to simplify it */ for (i = 0; i < earg->size; i++) earg->code[i].op = FETCH_OP_END; @@ -953,18 +1068,9 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t, int len; if (ctx->flags & TPARG_FL_TEVENT) { - if (code->data) - return -EFAULT; - ret = parse_trace_event_arg(arg, code, ctx); - if (!ret) - return 0; - if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) { - code->op = FETCH_OP_COMM; - return 0; - } - /* backward compatibility */ - ctx->offset = 0; - goto inval; + if (parse_trace_event(arg, code, ctx) < 0) + goto inval; + return 0; } if (str_has_prefix(arg, "retval")) { @@ -1231,6 +1337,9 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_IMM; } break; + case '(': + ret = handle_typecast(arg, pcode, end, ctx); + break; default: if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */ if (!tparg_is_function_entry(ctx->flags) && @@ -1563,6 +1672,9 @@ fail: } kfree(tmp); + /* struct_btf should not be passed to other arguments */ + clear_struct_btf(ctx); + return ret; } @@ -2051,7 +2163,6 @@ void trace_probe_cleanup(struct trace_probe *tp) traceprobe_free_probe_arg(&tp->args[i]); if (tp->entry_arg) { - kfree(tp->entry_arg->code); kfree(tp->entry_arg); tp->entry_arg = NULL; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 262d8707a3df..15758cc11fc6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -238,8 +238,8 @@ struct probe_arg { }; struct probe_entry_arg { - struct fetch_insn *code; unsigned int size; /* The entry data size */ + struct fetch_insn code[] __counted_by(size); }; struct trace_uprobe_filter { @@ -422,7 +422,9 @@ struct traceprobe_parse_context { const struct btf_param *params; /* Parameter of the function */ s32 nr_params; /* The number of the parameters */ struct btf *btf; /* The BTF to be used */ + struct btf *struct_btf; /* The BTF to be used 
for structs */ const struct btf_type *last_type; /* Saved type */ + const struct btf_type *last_struct; /* Saved structure */ u32 last_bitoffs; /* Saved bitoffs */ u32 last_bitsize; /* Saved bitsize */ struct trace_probe *tp; @@ -563,7 +565,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ C(TOO_MANY_ARGS, "Too many arguments are specified"), \ C(TOO_MANY_EARGS, "Too many entry arguments specified"), \ - C(EVENT_TOO_BIG, "Event too big (too many fields?)"), + C(EVENT_TOO_BIG, "Event too big (too many fields?)"), \ + C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8ad72e17d8eb..e98ee7e1e66f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1371,33 +1371,33 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, +static int perf_call_bpf_enter(struct trace_event_call *call, struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) + int syscall_nr, unsigned long *args) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long args[SYSCALL_DEFINE_MAXARGS]; } __aligned(8) param; + struct pt_regs regs = {}; int i; BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. 
&param) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)&param = regs; - param.syscall_nr = rec->nr; + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(&regs); + *(struct pt_regs **)&param = &regs; + param.syscall_nr = syscall_nr; for (i = 0; i < sys_data->nb_args; i++) - param.args[i] = rec->args[i]; - return trace_call_bpf(call, &param); + param.args[i] = args[i]; + return trace_call_bpf_faultable(call, &param); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; - struct pt_regs *fake_regs; struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; @@ -1410,12 +1410,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size = 0; int uargs = 0; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1429,6 +1424,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, args); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, sys_data, + syscall_nr, args)) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. 
+ */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + if (hlist_empty(head)) + return; + /* Check if this syscall event faults in user space memory */ mayfault = sys_data->user_mask != 0; @@ -1438,17 +1453,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; } - head = this_cpu_ptr(sys_data->enter_event->perf_events); - valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); - if (!valid_prog_array && hlist_empty(head)) - return; - /* get the size after alignment with the u32 buffer size field */ size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; @@ -1458,13 +1468,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (mayfault) syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); - if ((valid_prog_array && - !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, head, NULL); @@ -1514,40 +1517,35 @@ static void perf_sysenter_disable(struct trace_event_call *call) syscall_fault_buffer_disable(); } -static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, - struct syscall_trace_exit *rec) +static int perf_call_bpf_exit(struct trace_event_call *call, + int syscall_nr, long ret_val) { struct syscall_tp_t { struct trace_entry ent; int syscall_nr; unsigned long ret; } __aligned(8) param; - - /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. 
&param) */ - perf_fetch_caller_regs(regs); - *(struct pt_regs **)&param = regs; - param.syscall_nr = rec->nr; - param.ret = rec->ret; - return trace_call_bpf(call, &param); + struct pt_regs regs = {}; + + /* bpf prog requires 'regs' to be the first member in the ctx */ + perf_fetch_caller_regs(&regs); + *(struct pt_regs **)&param = &regs; + param.syscall_nr = syscall_nr; + param.ret = ret_val; + return trace_call_bpf_faultable(call, &param); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - struct pt_regs *fake_regs; struct hlist_head *head; bool valid_prog_array; int syscall_nr; int rctx; int size; - /* - * Syscall probe called with preemption enabled, but the ring - * buffer and per-cpu data require preemption to be disabled. - */ might_fault(); - guard(preempt_notrace)(); syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) @@ -1559,29 +1557,37 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - head = this_cpu_ptr(sys_data->exit_event->perf_events); + /* + * Run BPF program in faultable context before per-cpu buffer + * allocation, allowing sleepable BPF programs to execute. + */ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); - if (!valid_prog_array && hlist_empty(head)) + if (valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, syscall_nr, + syscall_get_return_value(current, regs))) + return; + + /* + * Per-cpu ring buffer and perf event list operations require + * preemption to be disabled. 
+ */ + guard(preempt_notrace)(); + + head = this_cpu_ptr(sys_data->exit_event->perf_events); + if (hlist_empty(head)) return; /* We can probably do that at build time */ size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); + rec = perf_trace_buf_alloc(size, NULL, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((valid_prog_array && - !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || - hlist_empty(head)) { - perf_swevent_put_recursion_context(rctx); - return; - } - perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1, regs, head, NULL); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2cabf8a23ec5..c274346853d1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -912,7 +912,7 @@ static int uprobe_buffer_enable(void) { int ret = 0; - BUG_ON(!mutex_is_locked(&event_mutex)); + lockdep_assert_held(&event_mutex); if (uprobe_buffer_refcnt++ == 0) { ret = uprobe_buffer_init(); @@ -927,7 +927,7 @@ static void uprobe_buffer_disable(void) { int cpu; - BUG_ON(!mutex_is_locked(&event_mutex)); + lockdep_assert_held(&event_mutex); if (--uprobe_buffer_refcnt == 0) { for_each_possible_cpu(cpu) @@ -979,6 +979,7 @@ static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, ucb = uprobe_buffer_get(); ucb->dsize = tu->tp.size + dsize; + BUILD_BUG_ON(MAX_UCB_BUFFER_SIZE < MAX_PROBE_EVENT_SIZE); if (WARN_ON_ONCE(ucb->dsize > MAX_UCB_BUFFER_SIZE)) { ucb->dsize = MAX_UCB_BUFFER_SIZE; dsize = MAX_UCB_BUFFER_SIZE - tu->tp.size; diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index bf1a507695b6..0dd7927df22a 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -386,13 +386,11 @@ static void tracing_map_elt_init_fields(struct tracing_map_elt *elt) } } -static void tracing_map_elt_free(struct tracing_map_elt 
*elt) +static void __tracing_map_elt_free(struct tracing_map_elt *elt) { if (!elt) return; - if (elt->map->ops && elt->map->ops->elt_free) - elt->map->ops->elt_free(elt); kfree(elt->fields); kfree(elt->vars); kfree(elt->var_set); @@ -400,6 +398,17 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt) kfree(elt); } +static void tracing_map_elt_free(struct tracing_map_elt *elt) +{ + if (!elt) + return; + + /* Only objects initialized with alloc_elt() should be passed to free_elt().*/ + if (elt->map->ops && elt->map->ops->elt_free) + elt->map->ops->elt_free(elt); + __tracing_map_elt_free(elt); +} + static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) { struct tracing_map_elt *elt; @@ -444,7 +453,7 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) } return elt; free: - tracing_map_elt_free(elt); + __tracing_map_elt_free(elt); return ERR_PTR(err); } diff --git a/kernel/umh.c b/kernel/umh.c index cffda97d961c..48117c569e1a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -430,7 +430,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; - queue_work(system_unbound_wq, &sub_info->work); + queue_work(system_dfl_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5f747f241a5f..78068ae8f28a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2281,6 +2281,14 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, unsigned int req_cpu = cpu; /* + * NOTE: Check whether the used workqueue is deprecated and warn + */ + if (unlikely(wq->flags & __WQ_DEPRECATED)) + pr_warn_once("workqueue: work func %ps enqueued on deprecated workqueue. 
" + "Use system_{percpu|dfl}_wq instead.\n", + work->func); + + /* * While a work item is PENDING && off queue, a task trying to * steal the PENDING will busy-loop waiting for it to either get * queued or lose PENDING. Grabbing PENDING and queueing should @@ -2296,6 +2304,18 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", work->func, wq->name))) { + struct work_offq_data offqd; + + /* + * State on entry: PENDING is set, work is off-queue (no + * insert_work() has run). + * + * Returning without clearing PENDING would leave the work + * in a weird state (PENDING=1, PWQ=0, entry empty) + */ + work_offqd_unpack(&offqd, *work_data_bits(work)); + set_work_pool_and_clear_pending(work, offqd.pool_id, + work_offqd_pack_flags(&offqd)); return; } rcu_read_lock(); @@ -5300,16 +5320,6 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } -static void apply_wqattrs_lock(void) -{ - mutex_lock(&wq_pool_mutex); -} - -static void apply_wqattrs_unlock(void) -{ - mutex_unlock(&wq_pool_mutex); -} - /** * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod * @attrs: the wq_attrs of the default pwq of the target workqueue @@ -5642,7 +5652,9 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]); } - return ret; + if (ret) + goto enomem; + return 0; enomem: if (wq->cpu_pwq) { @@ -5804,7 +5816,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) - flags |= WQ_UNBOUND; + flags = (flags & ~WQ_PERCPU) | WQ_UNBOUND; /* allocate wq and format name */ if (flags & WQ_UNBOUND) @@ -5828,6 +5840,23 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, 
pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", wq->name); + /* + * One among WQ_PERCPU and WQ_UNBOUND must be set, but not both. + * - If neither is set, default to WQ_PERCPU + * - If both are set, default to WQ_UNBOUND + * + * This code can be removed after workqueue are unbound by default + */ + if (unlikely(!(flags & (WQ_UNBOUND | WQ_PERCPU)))) { + WARN_ONCE(1, "workqueue: %s is using neither WQ_PERCPU or WQ_UNBOUND. " + "Setting WQ_PERCPU.\n", wq->name); + flags |= WQ_PERCPU; + } else if (unlikely((flags & WQ_PERCPU) && (flags & WQ_UNBOUND))) { + WARN_ONCE(1, "workqueue: %s uses both WQ_PERCPU and WQ_UNBOUND. " + "Dropped WQ_PERCPU, keeping WQ_UNBOUND.\n", wq->name); + flags &= ~WQ_PERCPU; + } + if (flags & WQ_BH) { /* * BH workqueues always share a single execution context per CPU @@ -5863,7 +5892,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, * wq_pool_mutex protects the workqueues list, allocations of PWQs, * and the global freeze state. 
*/ - apply_wqattrs_lock(); + mutex_lock(&wq_pool_mutex); if (alloc_and_link_pwqs(wq) < 0) goto err_unlock_free_node_nr_active; @@ -5877,7 +5906,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, if (wq_online && init_rescuer(wq) < 0) goto err_unlock_destroy; - apply_wqattrs_unlock(); + mutex_unlock(&wq_pool_mutex); if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; @@ -5885,7 +5914,7 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, return wq; err_unlock_free_node_nr_active: - apply_wqattrs_unlock(); + mutex_unlock(&wq_pool_mutex); /* * Failed alloc_and_link_pwqs() may leave pending pwq->release_work, * flushing the pwq_release_worker ensures that the pwq_release_workfn() @@ -5900,12 +5929,27 @@ err_free_wq: kfree(wq); return NULL; err_unlock_destroy: - apply_wqattrs_unlock(); + mutex_unlock(&wq_pool_mutex); err_destroy: destroy_workqueue(wq); return NULL; } +__printf(1, 0) +static struct workqueue_struct *alloc_workqueue_va(const char *fmt, + unsigned int flags, + int max_active, + va_list args) +{ + struct workqueue_struct *wq; + + wq = __alloc_workqueue(fmt, flags, max_active, args); + if (wq) + wq_init_lockdep(wq); + + return wq; +} + __printf(1, 4) struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, unsigned int flags, @@ -5915,12 +5959,8 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, va_list args; va_start(args, max_active); - wq = __alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); - if (!wq) - return NULL; - - wq_init_lockdep(wq); return wq; } @@ -5932,15 +5972,15 @@ static void devm_workqueue_release(void *res) } __printf(2, 5) struct workqueue_struct * -devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, - int max_active, ...) +devm_alloc_workqueue_noprof(struct device *dev, const char *fmt, + unsigned int flags, int max_active, ...) 
{ struct workqueue_struct *wq; va_list args; int ret; va_start(args, max_active); - wq = alloc_workqueue(fmt, flags, max_active, args); + wq = alloc_workqueue_va(fmt, flags, max_active, args); va_end(args); if (!wq) return NULL; @@ -5951,7 +5991,7 @@ devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, return wq; } -EXPORT_SYMBOL_GPL(devm_alloc_workqueue); +EXPORT_SYMBOL_GPL(devm_alloc_workqueue_noprof); #ifdef CONFIG_LOCKDEP __printf(1, 5) @@ -6285,7 +6325,7 @@ EXPORT_SYMBOL_GPL(set_worker_desc); */ void print_worker_info(const char *log_lvl, struct task_struct *task) { - work_func_t *fn = NULL; + work_func_t fn = NULL; char name[WQ_NAME_LEN] = { }; char desc[WORKER_DESC_LEN] = { }; struct pool_workqueue *pwq = NULL; @@ -7290,7 +7330,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, struct workqueue_attrs *attrs; int ret = -ENOMEM; - apply_wqattrs_lock(); + mutex_lock(&wq_pool_mutex); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) @@ -7303,7 +7343,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, ret = -EINVAL; out_unlock: - apply_wqattrs_unlock(); + mutex_unlock(&wq_pool_mutex); free_workqueue_attrs(attrs); return ret ?: count; } @@ -7329,7 +7369,7 @@ static ssize_t wq_cpumask_store(struct device *dev, struct workqueue_attrs *attrs; int ret = -ENOMEM; - apply_wqattrs_lock(); + mutex_lock(&wq_pool_mutex); attrs = wq_sysfs_prep_attrs(wq); if (!attrs) @@ -7340,7 +7380,7 @@ static ssize_t wq_cpumask_store(struct device *dev, ret = apply_workqueue_attrs_locked(wq, attrs); out_unlock: - apply_wqattrs_unlock(); + mutex_unlock(&wq_pool_mutex); free_workqueue_attrs(attrs); return ret ?: count; } @@ -7376,13 +7416,13 @@ static ssize_t wq_affn_scope_store(struct device *dev, if (affn < 0) return affn; - apply_wqattrs_lock(); + mutex_lock(&wq_pool_mutex); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_scope = affn; ret = apply_workqueue_attrs_locked(wq, attrs); } - 
apply_wqattrs_unlock(); + mutex_unlock(&wq_pool_mutex); free_workqueue_attrs(attrs); return ret ?: count; } @@ -7407,13 +7447,13 @@ static ssize_t wq_affinity_strict_store(struct device *dev, if (sscanf(buf, "%d", &v) != 1) return -EINVAL; - apply_wqattrs_lock(); + mutex_lock(&wq_pool_mutex); attrs = wq_sysfs_prep_attrs(wq); if (attrs) { attrs->affn_strict = (bool)v; ret = apply_workqueue_attrs_locked(wq, attrs); } - apply_wqattrs_unlock(); + mutex_unlock(&wq_pool_mutex); free_workqueue_attrs(attrs); return ret ?: count; } @@ -7454,12 +7494,12 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { ret = 0; - apply_wqattrs_lock(); + mutex_lock(&wq_pool_mutex); if (!cpumask_equal(cpumask, wq_unbound_cpumask)) ret = workqueue_apply_unbound_cpumask(cpumask); if (!ret) cpumask_copy(wq_requested_unbound_cpumask, cpumask); - apply_wqattrs_unlock(); + mutex_unlock(&wq_pool_mutex); } return ret; @@ -8012,12 +8052,12 @@ void __init workqueue_init_early(void) ordered_wq_attrs[i] = attrs; } - system_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_wq = alloc_workqueue("events", WQ_PERCPU | __WQ_DEPRECATED, 0); system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0); system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI | WQ_PERCPU, 0); system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0); - system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND | __WQ_DEPRECATED, WQ_MAX_ACTIVE); system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE | WQ_PERCPU, 0); @@ -8187,11 +8227,7 @@ static bool __init cpus_dont_share(int cpu0, int cpu1) static bool __init cpus_share_smt(int cpu0, int cpu1) { -#ifdef CONFIG_SCHED_SMT return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); -#else - return 
false; -#endif } static bool __init cpus_share_numa(int cpu0, int cpu1) |
