Diffstat (limited to 'kernel')
235 files changed, 10043 insertions, 4915 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 87866b037fbe..434929de17ef 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,11 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_context_tracking.o += -DDISABLE_BRANCH_PROFILING +endif + # Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() # in coverage traces. KCOV_INSTRUMENT_softirq.o := n diff --git a/kernel/acct.c b/kernel/acct.c index 31222e8cd534..6520baa13669 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -103,48 +103,50 @@ struct bsd_acct_struct { atomic_long_t count; struct rcu_head rcu; struct mutex lock; - int active; + bool active; + bool check_space; unsigned long needcheck; struct file *file; struct pid_namespace *ns; struct work_struct work; struct completion done; + acct_t ac; }; -static void do_acct_process(struct bsd_acct_struct *acct); +static void fill_ac(struct bsd_acct_struct *acct); +static void acct_write_process(struct bsd_acct_struct *acct); /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct bsd_acct_struct *acct) +static bool check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_after_jiffies(acct->needcheck)) - goto out; + if (!acct->check_space) + return acct->active; /* May block */ if (vfs_statfs(&acct->file->f_path, &sbuf)) - goto out; + return acct->active; if (acct->active) { u64 suspend = sbuf.f_blocks * SUSPEND; do_div(suspend, 100); if (sbuf.f_bavail <= suspend) { - acct->active = 0; + acct->active = false; pr_info("Process accounting paused\n"); } } else { u64 resume = sbuf.f_blocks * RESUME; do_div(resume, 100); if (sbuf.f_bavail >= resume) { - acct->active = 1; + acct->active = true; pr_info("Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; -out: return acct->active; } @@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin) { struct bsd_acct_struct *acct = to_acct(pin); mutex_lock(&acct->lock); - do_acct_process(acct); + /* + * Fill the accounting struct with the exiting task's info + * before punting to the workqueue. + */ + fill_ac(acct); schedule_work(&acct->work); wait_for_completion(&acct->done); cmpxchg(&acct->ns->bacct, pin, NULL); @@ -202,6 +208,9 @@ static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; + + /* We were fired by acct_pin_kill() which holds acct->lock. */ + acct_write_process(acct); if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); @@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname) return -EACCES; } + /* Exclude kernel kernel internal filesystems. */ + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + + /* Exclude procfs and sysfs. */ + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + if (!(file->f_mode & FMODE_CAN_WRITE)) { kfree(acct); filp_close(file, NULL); @@ -430,13 +453,27 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. 
*/ -static void fill_ac(acct_t *ac) +static void fill_ac(struct bsd_acct_struct *acct) { struct pacct_struct *pacct = &current->signal->pacct; + struct file *file = acct->file; + acct_t *ac = &acct->ac; u64 elapsed, run_time; time64_t btime; struct tty_struct *tty; + lockdep_assert_held(&acct->lock); + + if (time_is_after_jiffies(acct->needcheck)) { + acct->check_space = false; + + /* Don't fill in @ac if nothing will be written. */ + if (!acct->active) + return; + } else { + acct->check_space = true; + } + /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. @@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac) ac->ac_majflt = encode_comp_t(pacct->ac_majflt); ac->ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(&current->sighand->siglock); -} -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct) -{ - acct_t ac; - unsigned long flim; - const struct cred *orig_cred; - struct file *file = acct->file; - /* - * Accounting records are not subject to resource limits. - */ - flim = rlimit(RLIMIT_FSIZE); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct)) - goto out; - - fill_ac(&ac); /* we really need to bite the bullet and change layout */ - ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); - ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); + ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid()); + ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid()); #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ - ac.ac_uid16 = ac.ac_uid; - ac.ac_gid16 = ac.ac_gid; + ac->ac_uid16 = ac->ac_uid; + ac->ac_gid16 = ac->ac_gid; #elif ACCT_VERSION == 3 { struct pid_namespace *ns = acct->ns; - ac.ac_pid = task_tgid_nr_ns(current, ns); + ac->ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), - ns); + ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); } #endif +} + +static void acct_write_process(struct bsd_acct_struct *acct) +{ + struct file *file = acct->file; + const struct cred *cred; + acct_t *ac = &acct->ac; + + /* Perform file operations on behalf of whoever enabled accounting */ + cred = override_creds(file->f_cred); + /* - * Get freeze protection. If the fs is frozen, just skip the write - * as we could deadlock the system otherwise. + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. */ - if (file_start_write_trylock(file)) { + if (check_free_space(acct) && file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; - __kernel_write(file, &ac, sizeof(acct_t), &pos); + __kernel_write(file, ac, sizeof(acct_t), &pos); file_end_write(file); } -out: + + revert_creds(cred); +} + +static void do_acct_process(struct bsd_acct_struct *acct) +{ + unsigned long flim; + + /* Accounting records are not subject to resource limits. 
*/ + flim = rlimit(RLIMIT_FSIZE); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + fill_ac(acct); + acct_write_process(acct); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - revert_creds(orig_cred); } /** diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 7f358740e958..367eaf2c78b7 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -350,11 +350,10 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - if (d_is_positive(d)) { - /* update watch filter fields */ - watch->dev = d->d_sb->s_dev; - watch->ino = d_backing_inode(d)->i_ino; - } + /* update watch filter fields */ + watch->dev = d->d_sb->s_dev; + watch->ino = d_backing_inode(d)->i_ino; + inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; @@ -419,10 +418,11 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret) { + if (ret && ret != -ENOENT) { audit_put_watch(watch); return ret; } + ret = 0; /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9c853cde9abe..78fd876a5473 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2207,10 +2207,8 @@ __audit_reusename(const __user char *uptr) list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; - if (n->name->uptr == uptr) { - atomic_inc(&n->name->refcnt); - return n->name; - } + if (n->name->uptr == uptr) + return refname(n->name); } return NULL; } @@ -2237,7 +2235,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - atomic_inc(&name->refcnt); + refname(name); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2369,7 +2367,7 @@ out_alloc: return; if (name) { n->name = name; - atomic_inc(&name->refcnt); + refname(name); } out: @@ -2496,7 +2494,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - atomic_inc(&found_child->name->refcnt); + refname(found_child->name); } } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 410028633621..70502f038b92 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 870aeb51d70a..0d56cea71602 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -39,7 +39,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) +#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ - ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, 
&page); + ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); return VM_FAULT_SIGSEGV; @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (ret) goto out_free_pages; - ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, - node_id, page_cnt, pages); + ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); if (ret) goto out; @@ -577,8 +576,8 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) -BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index d5dc65bb1755..148da8f7ff36 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -153,7 +153,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, NULL); + bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); } /* *gfp_flags* is a hidden argument provided by the verifier */ @@ -161,6 +161,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; + bool nobusy; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -169,21 +170,21 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!cgroup) return (unsigned long)NULL; - if (!bpf_cgrp_storage_trylock()) - return (unsigned long)NULL; + nobusy = bpf_cgrp_storage_trylock(); - sdata = cgroup_storage_lookup(cgroup, map, true); + sdata = cgroup_storage_lookup(cgroup, map, nobusy); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, false, gfp_flags); unlock: - bpf_cgrp_storage_unlock(); + if (nobusy) + bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 106735145948..380e9a7cac75 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -335,7 +335,7 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, tinfo->btf_id = prog->aux->attach_btf_id; } -bool bpf_iter_prog_supported(struct bpf_prog *prog) +int bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; struct bpf_iter_target_info *tinfo = NULL, *iter; @@ -344,7 +344,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) int prefix_len = strlen(prefix); if (strncmp(attach_fname, prefix, prefix_len)) - return false; + return -EINVAL; mutex_lock(&targets_mutex); list_for_each_entry(iter, &targets, list) { @@ -360,12 +360,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); - if (tinfo) { - prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; - prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; - } + if (!tinfo) + return -EINVAL; - return tinfo != NULL; + return bpf_prog_ctx_arg_info_init(prog, tinfo->reg_info->ctx_arg_info, + tinfo->reg_info->ctx_arg_info_size); } const struct bpf_func_proto * diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 967492b65185..0a59df1c550a 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -316,7 +316,9 @@ BTF_ID(func, bpf_lsm_inode_getxattr) BTF_ID(func, bpf_lsm_inode_mknod) BTF_ID(func, bpf_lsm_inode_need_killpriv) BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_post_removexattr) BTF_ID(func, bpf_lsm_inode_readlink) +BTF_ID(func, bpf_lsm_inode_removexattr) BTF_ID(func, bpf_lsm_inode_rename) BTF_ID(func, bpf_lsm_inode_rmdir) BTF_ID(func, bpf_lsm_inode_setattr) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 040fb1cd840b..db13ee70d94d 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -146,39 +146,7 @@ void bpf_struct_ops_image_free(void *image) } #define MAYBE_NULL_SUFFIX "__nullable" -#define MAX_STUB_NAME 128 - -/* Return the type info of a stub function, if it exists. - * - * The name of a stub function is made up of the name of the struct_ops and - * the name of the function pointer member, separated by "__". For example, - * if the struct_ops type is named "foo_ops" and the function pointer - * member is named "bar", the stub function name would be "foo_ops__bar". - */ -static const struct btf_type * -find_stub_func_proto(const struct btf *btf, const char *st_op_name, - const char *member_name) -{ - char stub_func_name[MAX_STUB_NAME]; - const struct btf_type *func_type; - s32 btf_id; - int cp; - - cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s", - st_op_name, member_name); - if (cp >= MAX_STUB_NAME) { - pr_warn("Stub function name too long\n"); - return NULL; - } - btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC); - if (btf_id < 0) - return NULL; - func_type = btf_type_by_id(btf, btf_id); - if (!func_type) - return NULL; - - return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */ -} +#define REFCOUNTED_SUFFIX "__ref" /* Prepare argument info for every nullable argument of a member of a * struct_ops type. 
@@ -203,27 +171,44 @@ find_stub_func_proto(const struct btf *btf, const char *st_op_name, static int prepare_arg_info(struct btf *btf, const char *st_ops_name, const char *member_name, - const struct btf_type *func_proto, + const struct btf_type *func_proto, void *stub_func_addr, struct bpf_struct_ops_arg_info *arg_info) { const struct btf_type *stub_func_proto, *pointed_type; + bool is_nullable = false, is_refcounted = false; const struct btf_param *stub_args, *args; struct bpf_ctx_arg_aux *info, *info_buf; u32 nargs, arg_no, info_cnt = 0; + char ksym[KSYM_SYMBOL_LEN]; + const char *stub_fname; + const char *suffix; + s32 stub_func_id; u32 arg_btf_id; int offset; - stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name); - if (!stub_func_proto) - return 0; + stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym); + if (!stub_fname) { + pr_warn("Cannot find the stub function name for the %s in struct %s\n", + member_name, st_ops_name); + return -ENOENT; + } + + stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC); + if (stub_func_id < 0) { + pr_warn("Cannot find the stub function %s in btf\n", stub_fname); + return -ENOENT; + } + + stub_func_proto = btf_type_by_id(btf, stub_func_id); + stub_func_proto = btf_type_by_id(btf, stub_func_proto->type); /* Check if the number of arguments of the stub function is the same * as the number of arguments of the function pointer. */ nargs = btf_type_vlen(func_proto); if (nargs != btf_type_vlen(stub_func_proto)) { - pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n", - st_ops_name, member_name, member_name, st_ops_name); + pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n", + stub_fname, member_name, st_ops_name); return -EINVAL; } @@ -241,10 +226,18 @@ static int prepare_arg_info(struct btf *btf, info = info_buf; for (arg_no = 0; arg_no < nargs; arg_no++) { /* Skip arguments that is not suffixed with - * "__nullable". + * "__nullable or __ref". 
*/ - if (!btf_param_match_suffix(btf, &stub_args[arg_no], - MAYBE_NULL_SUFFIX)) + is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no], + MAYBE_NULL_SUFFIX); + is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no], + REFCOUNTED_SUFFIX); + + if (is_nullable) + suffix = MAYBE_NULL_SUFFIX; + else if (is_refcounted) + suffix = REFCOUNTED_SUFFIX; + else continue; /* Should be a pointer to struct */ @@ -253,30 +246,34 @@ static int prepare_arg_info(struct btf *btf, &arg_btf_id); if (!pointed_type || !btf_type_is_struct(pointed_type)) { - pr_warn("stub function %s__%s has %s tagging to an unsupported type\n", - st_ops_name, member_name, MAYBE_NULL_SUFFIX); + pr_warn("stub function %s has %s tagging to an unsupported type\n", + stub_fname, suffix); goto err_out; } offset = btf_ctx_arg_offset(btf, func_proto, arg_no); if (offset < 0) { - pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n", - st_ops_name, member_name, arg_no); + pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n", + stub_fname, arg_no); goto err_out; } if (args[arg_no].type != stub_args[arg_no].type) { - pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n", - arg_no, st_ops_name, member_name); + pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n", + arg_no, stub_fname); goto err_out; } /* Fill the information of the new argument */ - info->reg_type = - PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; info->btf_id = arg_btf_id; info->btf = btf; info->offset = offset; + if (is_nullable) { + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; + } else if (is_refcounted) { + info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID; + info->refcounted = true; + } info++; info_cnt++; @@ -324,6 +321,13 @@ static bool is_module_member(const struct btf *btf, u32 id) return !strcmp(btf_name_by_offset(btf, t->name_off), "module"); } +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); + + return func_ptr ? 0 : -ENOTSUPP; +} + int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log) @@ -386,8 +390,11 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, st_ops_desc->value_type = btf_type_by_id(btf, value_id); for_each_member(i, t, member) { - const struct btf_type *func_proto; + const struct btf_type *func_proto, *ret_type; + void **stub_func_addr; + u32 moff; + moff = __btf_member_bit_offset(t, member) / 8; mname = btf_name_by_offset(btf, member->name_off); if (!*mname) { pr_warn("anon member in struct %s is not supported\n", @@ -413,9 +420,23 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); - if (!func_proto) + + /* The member is not a function pointer or + * the function pointer is not supported. 
+ */ + if (!func_proto || bpf_struct_ops_supported(st_ops, moff)) continue; + if (func_proto->type) { + ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL); + if (ret_type && !__btf_type_is_struct(ret_type)) { + pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n", + mname, st_ops->name); + err = -EOPNOTSUPP; + goto errout; + } + } + if (btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[i])) { @@ -425,8 +446,9 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, goto errout; } + stub_func_addr = *(void **)(st_ops->cfi_stubs + moff); err = prepare_arg_info(btf, st_ops->name, mname, - func_proto, + func_proto, stub_func_addr, arg_info + i); if (err) goto errout; @@ -1152,13 +1174,6 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } -int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) -{ - void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); - - return func_ptr ? 0 : -ENOTSUPP; -} - static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9de6acddd479..16ba36f34dfa 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -606,6 +606,7 @@ s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) spin_unlock_bh(&btf_idr_lock); return ret; } +EXPORT_SYMBOL_GPL(bpf_find_btf_id); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) @@ -2575,7 +2576,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { + if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) { btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); return -EINVAL; } @@ -3332,6 +3333,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info, u32 field_mask) { enum btf_field_type type; + const char *tag_value; + bool is_type_tag; u32 res_id; /* Permit modifiers on the pointer itself */ @@ -3341,19 +3344,20 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; t = btf_type_by_id(btf, t->type); - - if (!btf_type_is_type_tag(t)) + is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t); + if (!is_type_tag) return BTF_FIELD_IGNORE; /* Reject extra tags */ if (btf_type_is_type_tag(btf_type_by_id(btf, t->type))) return -EINVAL; - if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off))) + tag_value = __btf_name_by_offset(btf, t->name_off); + if (!strcmp("kptr_untrusted", tag_value)) type = BPF_KPTR_UNREF; - else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("kptr", tag_value)) type = BPF_KPTR_REF; - else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("percpu_kptr", tag_value)) type = BPF_KPTR_PERCPU; - else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off))) + else if (!strcmp("uptr", tag_value)) type = BPF_UPTR; else return -EINVAL; @@ -3477,6 +3481,15 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ goto end; } } + if (field_mask & BPF_RES_SPIN_LOCK) { + if (!strcmp(name, "bpf_res_spin_lock")) { + if (*seen_mask & BPF_RES_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_RES_SPIN_LOCK; + type = BPF_RES_SPIN_LOCK; + goto end; + } + } if (field_mask & BPF_TIMER) { if (!strcmp(name, "bpf_timer")) { if 
(*seen_mask & BPF_TIMER) @@ -3655,6 +3668,7 @@ static int btf_find_field_one(const struct btf *btf, switch (field_type) { case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_WORKQUEUE: case BPF_LIST_NODE: @@ -3948,6 +3962,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type return ERR_PTR(-ENOMEM); rec->spin_lock_off = -EINVAL; + rec->res_spin_lock_off = -EINVAL; rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; @@ -3975,6 +3990,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->spin_lock_off = rec->fields[i].offset; break; + case BPF_RES_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->res_spin_lock_off = rec->fields[i].offset; + break; case BPF_TIMER: WARN_ON_ONCE(rec->timer_off >= 0); /* Cache offset for faster lookup at runtime */ @@ -4018,9 +4038,15 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->cnt++; } + if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) { + ret = -EINVAL; + goto end; + } + /* bpf_{list_head, rb_node} require bpf_spin_lock */ if ((btf_record_has_field(rec, BPF_LIST_HEAD) || - btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { + btf_record_has_field(rec, BPF_RB_ROOT)) && + (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) { ret = -EINVAL; goto end; } @@ -4944,11 +4970,6 @@ static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (btf_type_kflag(t)) { - btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); - return -EINVAL; - } - component_idx = btf_type_decl_tag(t)->component_idx; if (component_idx < -1) { btf_verifier_log_type(env, t, "Invalid component_idx"); @@ -5638,7 +5659,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type = &tab->types[tab->cnt]; type->btf_id = i; - record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT | BPF_KPTR, t->size); /* The record cannot be unset, treat it as an error if so */ @@ -6507,6 +6528,10 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { /* rxrpc */ { "rxrpc_recvdata", 0x1 }, { "rxrpc_resend", 0x10 }, + { "rxrpc_tq", 0x10 }, + { "rxrpc_client", 0x1 }, + /* skb */ + {"kfree_skb", 0x1000}, /* sunrpc */ { "xs_stream_read_data", 0x1 }, /* ... 
from xprt_cong_event event class */ @@ -6527,6 +6552,103 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "mr_integ_alloc", 0x2000 }, /* bpf_testmod */ { "bpf_testmod_test_read", 0x0 }, + /* amdgpu */ + { "amdgpu_vm_bo_map", 0x1 }, + { "amdgpu_vm_bo_unmap", 0x1 }, + /* netfs */ + { "netfs_folioq", 0x1 }, + /* xfs from xfs_defer_pending_class */ + { "xfs_defer_create_intent", 0x1 }, + { "xfs_defer_cancel_list", 0x1 }, + { "xfs_defer_pending_finish", 0x1 }, + { "xfs_defer_pending_abort", 0x1 }, + { "xfs_defer_relog_intent", 0x1 }, + { "xfs_defer_isolate_paused", 0x1 }, + { "xfs_defer_item_pause", 0x1 }, + { "xfs_defer_item_unpause", 0x1 }, + /* xfs from xfs_defer_pending_item_class */ + { "xfs_defer_add_item", 0x1 }, + { "xfs_defer_cancel_item", 0x1 }, + { "xfs_defer_finish_item", 0x1 }, + /* xfs from xfs_icwalk_class */ + { "xfs_ioc_free_eofblocks", 0x10 }, + { "xfs_blockgc_free_space", 0x10 }, + /* xfs from xfs_btree_cur_class */ + { "xfs_btree_updkeys", 0x100 }, + { "xfs_btree_overlapped_query_range", 0x100 }, + /* xfs from xfs_imap_class*/ + { "xfs_map_blocks_found", 0x10000 }, + { "xfs_map_blocks_alloc", 0x10000 }, + { "xfs_iomap_alloc", 0x1000 }, + { "xfs_iomap_found", 0x1000 }, + /* xfs from xfs_fs_class */ + { "xfs_inodegc_flush", 0x1 }, + { "xfs_inodegc_push", 0x1 }, + { "xfs_inodegc_start", 0x1 }, + { "xfs_inodegc_stop", 0x1 }, + { "xfs_inodegc_queue", 0x1 }, + { "xfs_inodegc_throttle", 0x1 }, + { "xfs_fs_sync_fs", 0x1 }, + { "xfs_blockgc_start", 0x1 }, + { "xfs_blockgc_stop", 0x1 }, + { "xfs_blockgc_worker", 0x1 }, + { "xfs_blockgc_flush_all", 0x1 }, + /* xfs_scrub */ + { "xchk_nlinks_live_update", 0x10 }, + /* xfs_scrub from xchk_metapath_class */ + { "xchk_metapath_lookup", 0x100 }, + /* nfsd */ + { "nfsd_dirent", 0x1 }, + { "nfsd_file_acquire", 0x1001 }, + { "nfsd_file_insert_err", 0x1 }, + { "nfsd_file_cons_err", 0x1 }, + /* nfs4 */ + { "nfs4_setup_sequence", 0x1 }, + { "pnfs_update_layout", 0x10000 }, + { "nfs4_inode_callback_event", 0x200 }, + { "nfs4_inode_stateid_callback_event", 0x200 }, + /* nfs from pnfs_layout_event */ + { "pnfs_mds_fallback_pg_init_read", 0x10000 }, + { "pnfs_mds_fallback_pg_init_write", 0x10000 }, + { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 }, + { "pnfs_mds_fallback_read_done", 0x10000 }, + { "pnfs_mds_fallback_write_done", 0x10000 }, + { "pnfs_mds_fallback_read_pagelist", 0x10000 }, + { "pnfs_mds_fallback_write_pagelist", 0x10000 }, + /* coda */ + { "coda_dec_pic_run", 0x10 }, + { "coda_dec_pic_done", 0x10 }, + /* cfg80211 */ + { "cfg80211_scan_done", 0x11 }, + { "rdev_set_coalesce", 0x10 }, + { "cfg80211_report_wowlan_wakeup", 0x100 }, + { "cfg80211_inform_bss_frame", 0x100 }, + { "cfg80211_michael_mic_failure", 0x10000 }, + /* cfg80211 from wiphy_work_event */ + { "wiphy_work_queue", 0x10 }, + { "wiphy_work_run", 0x10 }, + { "wiphy_work_cancel", 0x10 }, + { "wiphy_work_flush", 0x10 }, + /* hugetlbfs */ + { "hugetlbfs_alloc_inode", 0x10 }, + /* spufs */ + { "spufs_context", 0x10 }, + /* kvm_hv */ + { "kvm_page_fault_enter", 0x100 }, + /* dpu */ + { "dpu_crtc_setup_mixer", 0x100 }, + /* binder */ + { "binder_transaction", 0x100 }, + /* bcachefs */ + { "btree_path_free", 0x100 }, + /* hfi1_tx */ + { "hfi1_sdma_progress", 0x1000 }, + /* iptfs */ + { "iptfs_ingress_postq_event", 0x1000 }, + /* neigh */ + { "neigh_update", 0x10 }, + /* snd_firewire_lib */ + { "amdtp_packet", 0x100 }, }; bool btf_ctx_access(int off, int size, enum bpf_access_type type, @@ -6677,6 +6799,7 @@ bool btf_ctx_access(int off, int size, enum 
bpf_access_type type, info->reg_type = ctx_arg_info->reg_type; info->btf = ctx_arg_info->btf ? : btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; + info->ref_obj_id = ctx_arg_info->ref_obj_id; return true; } } @@ -6743,7 +6866,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->btf_id = t->type; t = btf_type_by_id(btf, t->type); - if (btf_type_is_type_tag(t)) { + if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag_value, "user") == 0) info->reg_type |= MEM_USER; @@ -7002,7 +7125,7 @@ error: /* check type tag */ t = btf_type_by_id(btf, mtype->type); - if (btf_type_is_type_tag(t)) { + if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); /* check __user tag */ if (strcmp(tag_value, "user") == 0) @@ -8524,6 +8647,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: return BTF_KFUNC_HOOK_CGROUP; case BPF_PROG_TYPE_SCHED_ACT: return BTF_KFUNC_HOOK_SCHED_ACT; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 46e5db65dbc8..84f58f3d028a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct hlist_head *head) +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) { struct bpf_prog_list *pl; u32 cnt = 0; @@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head) hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; + if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) + (*preorder_cnt)++; cnt++; } return cnt; @@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[atype]); + cnt = prog_list_length(&p->bpf.progs[atype], NULL); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp, struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; - int cnt = 0; + int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; /* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[atype]); + cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); p = cgroup_parent(p); } while (p); @@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp, /* populate the array with effective progs */ cnt = 0; p = cgrp; + fstart = preorder_cnt; + bstart = preorder_cnt - 1; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; + init_bstart = bstart; hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; - item = &progs->items[cnt]; + if (pl->flags & BPF_F_PREORDER) { + item = &progs->items[bstart]; + bstart--; + } else { + item = &progs->items[fstart]; + fstart++; + } item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; } + + /* reverse pre-ordering progs at this cgroup level */ + for (i = bstart + 1, j = init_bstart; i < j; i++, j--) + swap(progs->items[i], progs->items[j]); + } while ((p = 
cgroup_parent(p))); *array = progs; @@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, */ return -EPERM; - if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; pl = find_attach_entry(progs, prog, link, replace_prog, @@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; + pl->flags = flags; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; @@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); } else { - total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); } } @@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; progs = &cgrp->bpf.progs[atype]; - cnt = min_t(int, prog_list_length(progs), total_cnt); + cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); i = 0; hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index da729cbbaeb9..ba6b6118cf50 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1663,14 +1663,17 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ + /* Atomic operations. */ \ + INSN_3(STX, ATOMIC, B), \ + INSN_3(STX, ATOMIC, H), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, ATOMIC, W), \ - INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -2152,24 +2155,33 @@ out: if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ + else \ + goto default_label; \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ + else \ + goto default_label; \ break; STX_ATOMIC_DW: STX_ATOMIC_W: + STX_ATOMIC_H: + STX_ATOMIC_B: switch (IMM) { + /* Atomic read-modify-write instructions support only W and DW + * size modifiers. + */ ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) @@ -2181,20 +2193,63 @@ out: SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); + else + goto default_label; break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); + else + goto default_label; + break; + /* Atomic load and store instructions support all size + * modifiers. 
+ */ + case BPF_LOAD_ACQ: + switch (BPF_SIZE(insn->code)) { +#define LOAD_ACQUIRE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + DST = (SIZE)smp_load_acquire( \ + (SIZE *)(unsigned long)(SRC + insn->off)); \ + break; + LOAD_ACQUIRE(B, u8) + LOAD_ACQUIRE(H, u16) + LOAD_ACQUIRE(W, u32) +#ifdef CONFIG_64BIT + LOAD_ACQUIRE(DW, u64) +#endif +#undef LOAD_ACQUIRE + default: + goto default_label; + } + break; + case BPF_STORE_REL: + switch (BPF_SIZE(insn->code)) { +#define STORE_RELEASE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + smp_store_release( \ + (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \ + break; + STORE_RELEASE(B, u8) + STORE_RELEASE(H, u16) + STORE_RELEASE(W, u32) +#ifdef CONFIG_64BIT + STORE_RELEASE(DW, u64) +#endif +#undef STORE_RELEASE + default: + goto default_label; + } break; default: @@ -2290,17 +2345,18 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) insn->code = BPF_JMP | BPF_CALL_ARGS; } #endif -#else +#endif + static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, so warn about it! + * is not working properly, or interpreter is being used when + * prog->jit_requested is not 0, so warn about it! */ WARN_ON_ONCE(1); return 0; } -#endif bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) @@ -2380,8 +2436,18 @@ static void bpf_prog_select_func(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + u32 idx = (round_up(stack_depth, 32) / 32) - 1; - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; + /* may_goto may cause stack size > 512, leading to idx out-of-bounds. + * But for non-JITed programs, we don't need bpf_func, so no bounds + * check needed. + */ + if (!fp->jit_requested && + !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + fp->bpf_func = interpreters[idx]; + } else { + fp->bpf_func = __bpf_prog_ret0_warn; + } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif @@ -2906,6 +2972,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) return NULL; } +const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void) +{ + return NULL; +} + u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) @@ -3058,6 +3129,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +bool __weak bpf_jit_supports_timed_may_goto(void) +{ + return false; +} + +u64 __weak arch_bpf_timed_may_goto(void) +{ + return 0; +} + +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) +{ + u64 time = ktime_get_mono_fast_ns(); + + /* Populate the timestamp for this stack frame, and refresh count. */ + if (!p->timestamp) { + p->timestamp = time; + return BPF_MAX_TIMED_LOOPS; + } + /* Check if we've exhausted our time slice, and zero count. */ + if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + return 0; + /* Refresh the count for the stack frame. 
*/ + return BPF_MAX_TIMED_LOOPS; +} + /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 774accbd4a22..67e8a2fc1a99 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -33,8 +33,8 @@ #include <trace/events/xdp.h> #include <linux/btf_ids.h> -#include <linux/netdevice.h> /* netif_receive_skb_list */ -#include <linux/etherdevice.h> /* eth_type_trans */ +#include <linux/netdevice.h> +#include <net/gro.h> /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is @@ -68,6 +68,7 @@ struct bpf_cpu_map_entry { struct bpf_cpumap_val value; struct bpf_prog *prog; + struct gro_node gro; struct completion kthread_running; struct rcu_work free_work; @@ -133,22 +134,23 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) } } -static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, - struct list_head *listp, - struct xdp_cpumap_stats *stats) +static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, + void **skbs, u32 skb_n, + struct xdp_cpumap_stats *stats) { - struct sk_buff *skb, *tmp; struct xdp_buff xdp; - u32 act; + u32 act, pass = 0; int err; - list_for_each_entry_safe(skb, tmp, listp, list) { + for (u32 i = 0; i < skb_n; i++) { + struct sk_buff *skb = skbs[i]; + act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog); switch (act) { case XDP_PASS: + skbs[pass++] = skb; break; case XDP_REDIRECT: - skb_list_del_init(skb); err = xdp_do_generic_redirect(skb->dev, skb, &xdp, rcpu->prog); if (unlikely(err)) { @@ -157,7 +159,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, } else { stats->redirect++; } - return; + break; default: bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; @@ -165,12 +167,15 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, trace_xdp_exception(skb->dev, rcpu->prog, act); fallthrough; case XDP_DROP: - skb_list_del_init(skb); - kfree_skb(skb); + napi_consume_skb(skb, true); stats->drop++; - return; + break; } } + + stats->pass += pass; + + return pass; } static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, @@ -204,7 +209,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, stats->drop++; } else { frames[nframes++] = xdpf; - stats->pass++; } break; case XDP_REDIRECT: @@ -228,43 +232,65 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } xdp_clear_return_frame_no_direct(); + stats->pass += nframes; return nframes; } #define CPUMAP_BATCH 8 -static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, - int xdp_n, struct xdp_cpumap_stats *stats, - struct list_head *list) +struct cpu_map_ret { + u32 xdp_n; + u32 skb_n; +}; + +static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, + void **skbs, struct cpu_map_ret *ret, + struct xdp_cpumap_stats *stats) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; - int nframes; if (!rcpu->prog) - return xdp_n; + goto out; - rcu_read_lock_bh(); + rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats); + ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); + if (unlikely(ret->skb_n)) + ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n, + stats); if (stats->redirect) xdp_do_flush(); - if (unlikely(!list_empty(list))) 
- cpu_map_bpf_prog_run_skb(rcpu, list, stats); - bpf_net_ctx_clear(bpf_net_ctx); - rcu_read_unlock_bh(); /* resched point, may call do_softirq() */ + rcu_read_unlock(); - return nframes; +out: + if (unlikely(ret->skb_n) && ret->xdp_n) + memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs)); +} + +static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) +{ + /* + * If the ring is not empty, there'll be a new iteration soon, and we + * only need to do a full flush if a tick is long (> 1 ms). + * If the ring is empty, to not hold GRO packets in the stack for too + * long, do a full flush. + * This is equivalent to how NAPI decides whether to perform a full + * flush. + */ + gro_flush(&rcpu->gro, !empty && HZ >= 1000); + gro_normal_list(&rcpu->gro); } static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; unsigned long last_qs = jiffies; + u32 packets = 0; complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); @@ -277,11 +303,11 @@ static int cpu_map_kthread_run(void *data) while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { struct xdp_cpumap_stats stats = {}; /* zero stats */ unsigned int kmem_alloc_drops = 0, sched = 0; - gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; - int i, n, m, nframes, xdp_n; + struct cpu_map_ret ret = { }; void *frames[CPUMAP_BATCH]; void *skbs[CPUMAP_BATCH]; - LIST_HEAD(list); + u32 i, n, m; + bool empty; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -306,7 +332,7 @@ static int cpu_map_kthread_run(void *data) */ n = __ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); - for (i = 0, xdp_n = 0; i < n; i++) { + for (i = 0; i < n; i++) { void *f = frames[i]; struct page *page; @@ -314,11 +340,11 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = f; __ptr_clear_bit(0, &skb); - list_add_tail(&skb->list, &list); + skbs[ret.skb_n++] = skb; continue; } - frames[xdp_n++] = f; + frames[ret.xdp_n++] = f; page = virt_to_page(f); /* Bring struct page memory area to curr CPU. Read by @@ -328,40 +354,51 @@ static int cpu_map_kthread_run(void *data) prefetchw(page); } + local_bh_disable(); + /* Support running another XDP prog on this CPU */ - nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list); - if (nframes) { - m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, - gfp, nframes, skbs); - if (unlikely(m == 0)) { - for (i = 0; i < nframes; i++) - skbs[i] = NULL; /* effect: xdp_return_frame */ - kmem_alloc_drops += nframes; - } + cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats); + if (!ret.xdp_n) + goto stats; + + m = napi_skb_cache_get_bulk(skbs, ret.xdp_n); + if (unlikely(m < ret.xdp_n)) { + for (i = m; i < ret.xdp_n; i++) + xdp_return_frame(frames[i]); + + if (ret.skb_n) + memmove(&skbs[m], &skbs[ret.xdp_n], + ret.skb_n * sizeof(*skbs)); + + kmem_alloc_drops += ret.xdp_n - m; + ret.xdp_n = m; } - local_bh_disable(); - for (i = 0; i < nframes; i++) { + for (i = 0; i < ret.xdp_n; i++) { struct xdp_frame *xdpf = frames[i]; - struct sk_buff *skb = skbs[i]; - skb = __xdp_build_skb_from_frame(xdpf, skb, - xdpf->dev_rx); - if (!skb) { - xdp_return_frame(xdpf); - continue; - } - - list_add_tail(&skb->list, &list); + /* Can fail only when !skb -- already handled above */ + __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx); } +stats: /* Feedback loop via tracepoint. * NB: keep before recv to allow measuring enqueue/dequeue latency. 
*/ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, sched, &stats); - netif_receive_skb_list(&list); + for (i = 0; i < ret.xdp_n + ret.skb_n; i++) + gro_receive_skb(&rcpu->gro, skbs[i]); + + /* Flush either every 64 packets or in case of empty ring */ + packets += n; + empty = __ptr_ring_empty(rcpu->queue); + if (packets >= NAPI_POLL_WEIGHT || empty) { + cpu_map_gro_flush(rcpu, empty); + packets = 0; + } + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -430,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->cpu = cpu; rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; + gro_init(&rcpu->gro); if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; @@ -458,6 +496,7 @@ free_prog: if (rcpu->prog) bpf_prog_put(rcpu->prog); free_ptr_ring: + gro_cleanup(&rcpu->gro); ptr_ring_cleanup(rcpu->queue, NULL); free_queue: kfree(rcpu->queue); @@ -487,6 +526,7 @@ static void __cpu_map_entry_free(struct work_struct *work) if (rcpu->prog) bpf_prog_put(rcpu->prog); + gro_cleanup(&rcpu->gro); /* The queue should be empty at this point */ __cpu_map_ring_cleanup(rcpu->queue); ptr_ring_cleanup(rcpu->queue, NULL); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index cfa1c18e3a48..9876c5fe6c2a 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -45,6 +45,10 @@ __bpf_kfunc_start_defs(); * * bpf_cpumask_create() allocates memory using the BPF memory allocator, and * will not block. It may return NULL if no memory is available. + * + * Return: + * * A pointer to a new struct bpf_cpumask instance on success. + * * NULL if the BPF memory allocator is out of memory. */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) { @@ -71,6 +75,10 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) * Acquires a reference to a BPF cpumask. The cpumask returned by this function * must either be embedded in a map as a kptr, or freed with * bpf_cpumask_release(). + * + * Return: + * * The struct bpf_cpumask pointer passed to the function. + * */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) { @@ -106,6 +114,9 @@ CFI_NOSEAL(bpf_cpumask_release_dtor); * * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first nonzero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) { @@ -119,6 +130,9 @@ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) * * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first zero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) { @@ -133,6 +147,9 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) * * Find the index of the first nonzero bit of the AND of two cpumasks. * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + * + * Return: + * * The index of the first bit that is nonzero in both cpumask instances. */ __bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1, const struct cpumask *src2) @@ -414,12 +431,47 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, * @cpumask: The cpumask being queried. * * Count the number of set bits in the given cpumask. 
+ * + * Return: + * * The number of bits set in the mask. */ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) { return cpumask_weight(cpumask); } +/** + * bpf_cpumask_populate() - Populate the CPU mask from the contents of + * a BPF memory region. + * + * @cpumask: The cpumask being populated. + * @src: The BPF memory holding the bit pattern. + * @src__sz: Length of the BPF memory region in bytes. + * + * Return: + * * 0 if the struct cpumask * instance was populated successfully. + * * -EACCES if the memory region is too small to populate the cpumask. + * * -EINVAL if the memory region is not aligned to the size of a long + * and the architecture does not support efficient unaligned accesses. + */ +__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz) +{ + unsigned long source = (unsigned long)src; + + /* The memory region must be large enough to populate the entire CPU mask. */ + if (src__sz < bitmap_size(nr_cpu_ids)) + return -EACCES; + + /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + !IS_ALIGNED(source, sizeof(long))) + return -EINVAL; + + bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids); + + return 0; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) @@ -448,6 +500,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU) BTF_KFUNCS_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 309c4aa1b026..20883c6b1546 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -202,7 +202,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (is_addr_space_cast(insn)) { - verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n", + verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n", insn->code, insn->dst_reg, insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm); } else if (is_mov_percpu_addr(insn)) { @@ -267,6 +267,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ) { + verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_STORE_REL) { + verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } @@ -369,7 +381,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->code, class == BPF_JMP32 ? 
'w' : 'r', insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); + (u32)insn->imm, insn->off); } } else { verbose(cbs->private_data, "(%02x) %s\n", diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4a9eeb7aef85..5a5adc66b8e2 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include "bpf_lru_list.h" #include "map_in_map.h" #include <linux/bpf_mem_alloc.h> +#include <asm/rqspinlock.h> #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ @@ -78,7 +79,7 @@ */ struct bucket { struct hlist_nulls_head head; - raw_spinlock_t raw_lock; + rqspinlock_t raw_lock; }; #define HASHTAB_MAP_LOCK_COUNT 8 @@ -104,8 +105,6 @@ struct bpf_htab { u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; - struct lock_class_key lockdep_key; - int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ @@ -140,45 +139,26 @@ static void htab_init_buckets(struct bpf_htab *htab) for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - raw_spin_lock_init(&htab->buckets[i].raw_lock); - lockdep_set_class(&htab->buckets[i].raw_lock, - &htab->lockdep_key); + raw_res_spin_lock_init(&htab->buckets[i].raw_lock); cond_resched(); } } -static inline int htab_lock_bucket(const struct bpf_htab *htab, - struct bucket *b, u32 hash, - unsigned long *pflags) +static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags) { unsigned long flags; + int ret; - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - - preempt_disable(); - local_irq_save(flags); - if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { - __this_cpu_dec(*(htab->map_locked[hash])); - local_irq_restore(flags); - preempt_enable(); - return -EBUSY; - } - - raw_spin_lock(&b->raw_lock); + ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags); + if (ret) + return ret; *pflags = flags; - return 0; } -static inline void htab_unlock_bucket(const struct bpf_htab *htab, - struct bucket *b, u32 hash, - unsigned long flags) +static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags) { - hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); - raw_spin_unlock(&b->raw_lock); - __this_cpu_dec(*(htab->map_locked[hash])); - local_irq_restore(flags); - preempt_enable(); + raw_res_spin_unlock_irqrestore(&b->raw_lock, flags); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -198,12 +178,12 @@ static bool htab_is_percpu(const struct bpf_htab *htab) static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { - *(void __percpu **)(l->key + key_size) = pptr; + *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr; } static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) { - return *(void __percpu **)(l->key + key_size); + return *(void __percpu **)(l->key + roundup(key_size, 8)); } static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) @@ -483,14 +463,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; - int err, i; + int err; htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); - lockdep_register_key(&htab->lockdep_key); - 
bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { @@ -536,15 +514,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_elem_count; - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { - htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, - sizeof(int), - sizeof(int), - GFP_USER); - if (!htab->map_locked[i]) - goto free_map_locked; - } - if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else @@ -607,15 +576,12 @@ free_prealloc: free_map_locked: if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) - free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); bpf_mem_alloc_destroy(&htab->ma); free_elem_count: bpf_map_free_elem_count(&htab->map); free_htab: - lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); return ERR_PTR(err); } @@ -787,6 +753,9 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map, static void check_and_free_fields(struct bpf_htab *htab, struct htab_elem *elem) { + if (IS_ERR_OR_NULL(htab->map.record)) + return; + if (htab_is_percpu(htab)) { void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); int cpu; @@ -817,7 +786,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) b = __select_bucket(htab, tgt_l->hash); head = &b->head; - ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return false; @@ -828,7 +797,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - htab_unlock_bucket(htab, b, tgt_l->hash, flags); + htab_unlock_bucket(b, flags); if (l == tgt_l) check_and_free_fields(htab, l); @@ -1147,7 +1116,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1198,7 +1167,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, check_and_free_fields(htab, l_old); } } - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l_old) { if (old_map_ptr) map->ops->map_fd_put_ptr(map, old_map_ptr, true); @@ -1207,7 +1176,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, } return 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); return ret; } @@ -1254,7 +1223,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value copy_map_value(&htab->map, l_new->key + round_up(map->key_size, 8), value); - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; @@ -1275,7 +1244,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); err_lock_bucket: if (ret) @@ -1312,7 +1281,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1337,7 +1306,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); return ret; } @@ -1378,7 +1347,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - 
ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) goto err_lock_bucket; @@ -1402,7 +1371,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); err_lock_bucket: if (l_new) { bpf_map_dec_elem_count(&htab->map); @@ -1444,7 +1413,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1454,7 +1423,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key) else ret = -ENOENT; - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l) free_htab_elem(htab, l); @@ -1480,7 +1449,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) return ret; @@ -1491,7 +1460,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) else ret = -ENOENT; - htab_unlock_bucket(htab, b, hash, flags); + htab_unlock_bucket(b, flags); if (l) htab_lru_push_free(htab, l); return ret; @@ -1558,7 +1527,6 @@ static void htab_map_free_timers_and_wq(struct bpf_map *map) static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. @@ -1583,9 +1551,6 @@ static void htab_map_free(struct bpf_map *map) bpf_mem_alloc_destroy(&htab->ma); if (htab->use_percpu_counter) percpu_counter_destroy(&htab->pcount); - for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) - free_percpu(htab->map_locked[i]); - lockdep_unregister_key(&htab->lockdep_key); bpf_map_area_free(htab); } @@ -1628,7 +1593,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - ret = htab_lock_bucket(htab, b, hash, &bflags); + ret = htab_lock_bucket(b, &bflags); if (ret) return ret; @@ -1665,7 +1630,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, hlist_nulls_del_rcu(&l->hash_node); out_unlock: - htab_unlock_bucket(htab, b, hash, bflags); + htab_unlock_bucket(b, bflags); if (l) { if (is_lru_map) @@ -1787,7 +1752,7 @@ again_nocopy: head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ if (locked) { - ret = htab_lock_bucket(htab, b, batch, &flags); + ret = htab_lock_bucket(b, &flags); if (ret) { rcu_read_unlock(); bpf_enable_instrumentation(); @@ -1810,7 +1775,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; @@ -1821,7 +1786,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. 
*/ - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); @@ -1884,7 +1849,7 @@ again_nocopy: dst_val += value_size; } - htab_unlock_bucket(htab, b, batch, flags); + htab_unlock_bucket(b, flags); locked = false; while (node_to_free) { @@ -2354,7 +2319,7 @@ static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, - offsetof(struct htab_elem, key) + map->key_size); + offsetof(struct htab_elem, key) + roundup(map->key_size, 8)); *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f27ce162427a..e3a2662f4e33 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1284,8 +1284,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u atomic_set(&t->cancelling, 0); INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; + hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: @@ -1759,8 +1758,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, + u32 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1793,6 +1792,12 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern } } +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) +{ + return __bpf_dynptr_read(dst, len, src, offset, flags); +} + static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, @@ -1804,8 +1809,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, + u32 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1843,6 +1848,12 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v } } +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) +{ + return __bpf_dynptr_write(dst, offset, src, len, flags); +} + static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, @@ -2044,6 +2055,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_pt_regs_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); + case BPF_FUNC_perf_event_read_value: + return bpf_get_perf_event_read_value_proto(); default: return NULL; } @@ -2758,6 +2771,61 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, return 0; } +/** + * bpf_dynptr_copy() - Copy data from one dynptr to another. 
+ * @dst_ptr: Destination dynptr - where data should be copied to + * @dst_off: Offset into the destination dynptr + * @src_ptr: Source dynptr - where data should be copied from + * @src_off: Offset into the source dynptr + * @size: Length of the data to copy from source to destination + * + * Copies data from source dynptr to destination dynptr. + * Returns 0 on success; negative error, otherwise. + */ +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, + struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +{ + struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + void *src_slice, *dst_slice; + char buf[256]; + u32 off; + + src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); + dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); + + if (src_slice && dst_slice) { + memmove(dst_slice, src_slice, size); + return 0; + } + + if (src_slice) + return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); + + if (dst_slice) + return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); + + if (bpf_dynptr_check_off_len(dst, dst_off, size) || + bpf_dynptr_check_off_len(src, src_off, size)) + return -E2BIG; + + off = 0; + while (off < size) { + u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + int err; + + err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); + if (err) + return err; + err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); + if (err) + return err; + + off += chunk_sz; + } + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -3067,6 +3135,50 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user return ret + 1; } +/** + * bpf_copy_from_user_task_str() - Copy a string from an task's address space + * @dst: Destination address, in kernel space. This buffer must be + * at least @dst__sz bytes long. + * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL. + * @unsafe_ptr__ign: Source address in the task's address space. + * @tsk: The task whose address space will be used + * @flags: The only supported flag is BPF_F_PAD_ZEROS + * + * Copies a NUL terminated string from a task's address space to @dst__sz + * buffer. If user string is too long this will still ensure zero termination + * in the @dst__sz buffer unless buffer size is 0. + * + * If BPF_F_PAD_ZEROS flag is set, memset the tail of @dst__sz to 0 on success + * and memset all of @dst__sz on failure. + * + * Return: The number of copied bytes on success including the NUL terminator. + * A negative error code on failure. 
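A hedged sketch of how a sleepable BPF program might call this kfunc; the function name, task pointer and user pointer are assumptions for illustration, not part of the patch:

    static int read_remote_str(struct task_struct *task, const void *uptr)
    {
        char buf[64];
        int len;

        len = bpf_copy_from_user_task_str(buf, sizeof(buf), uptr, task,
                                          BPF_F_PAD_ZEROS);
        if (len < 0)
            return 0;   /* buf was fully zeroed because BPF_F_PAD_ZEROS is set */
        /* On success, len includes the trailing NUL. */
        return len;
    }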
+ */ +__bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz, + const void __user *unsafe_ptr__ign, + struct task_struct *tsk, u64 flags) +{ + int ret; + + if (unlikely(flags & ~BPF_F_PAD_ZEROS)) + return -EINVAL; + + if (unlikely(dst__sz == 0)) + return 0; + + ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0); + if (ret < 0) { + if (flags & BPF_F_PAD_ZEROS) + memset(dst, 0, dst__sz); + return ret; + } + + if (flags & BPF_F_PAD_ZEROS) + memset(dst + ret, 0, dst__sz - ret); + + return ret + 1; +} + /* Keep unsinged long in prototype so that kfunc is usable when emitted to * vmlinux.h in BPF programs directly, but note that while in BPF prog, the * unsigned long always points to 8-byte region on stack, the kernel may only @@ -3162,6 +3274,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_dynptr_copy) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif @@ -3174,6 +3287,7 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9aaf5124648b..dc3aa91a6ba0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,14 +150,14 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } -static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; @@ -166,7 +166,7 @@ static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, inc_nlink(dir); bpf_dentry_finalize(dentry, inode, dir); - return 0; + return NULL; } struct map_iter { diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e8a772e64324..be66d7e520e0 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -15,6 +15,7 @@ #include <net/ipv6.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <asm/rqspinlock.h> #include <linux/bpf_mem_alloc.h> /* Intermediate node */ @@ -36,7 +37,7 @@ struct lpm_trie { size_t n_entries; size_t max_prefixlen; size_t data_size; - raw_spinlock_t lock; + rqspinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -342,7 +343,9 @@ static long trie_update_elem(struct bpf_map *map, if (!new_node) return -ENOMEM; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); + if (ret) + goto out_free; new_node->prefixlen = key->prefixlen; RCU_INIT_POINTER(new_node->child[0], NULL); @@ -356,8 +359,7 @@ static long trie_update_elem(struct bpf_map *map, */ slot = &trie->root; - while ((node = rcu_dereference_protected(*slot, - lockdep_is_held(&trie->lock)))) { + 
while ((node = rcu_dereference(*slot))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -442,8 +444,8 @@ static long trie_update_elem(struct bpf_map *map, rcu_assign_pointer(*slot, im_node); out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); - + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); +out_free: if (ret) bpf_mem_cache_free(&trie->ma, new_node); bpf_mem_cache_free_rcu(&trie->ma, free_node); @@ -467,7 +469,9 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - raw_spin_lock_irqsave(&trie->lock, irq_flags); + ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags); + if (ret) + return ret; /* Walk the tree looking for an exact key/length match and keeping * track of the path we traverse. We will need to know the node @@ -478,8 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) trim = &trie->root; trim2 = trim; parent = NULL; - while ((node = rcu_dereference_protected( - *trim, lockdep_is_held(&trie->lock)))) { + while ((node = rcu_dereference(*trim))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -543,7 +546,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) free_node = node; out: - raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags); bpf_mem_cache_free_rcu(&trie->ma, free_parent); bpf_mem_cache_free_rcu(&trie->ma, free_node); @@ -592,7 +595,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; - raw_spin_lock_init(&trie->lock); + raw_res_spin_lock_init(&trie->lock); /* Allocate intermediate and leaf nodes from the same allocator */ leaf_size = sizeof(struct lpm_trie_node) + trie->data_size + diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 1a4fec330eaa..42ae8d595c2c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -25,6 +25,7 @@ #include <linux/rhashtable.h> #include <linux/rtnetlink.h> #include <linux/rwsem.h> +#include <net/netdev_lock.h> #include <net/xdp.h> /* Protects offdevs, members of bpf_offload_netdev and offload members @@ -528,13 +529,14 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&offmap->map, attr); - rtnl_lock(); - down_write(&bpf_devs_lock); offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); err = bpf_dev_offload_check(offmap->netdev); if (err) - goto err_unlock; + goto err_unlock_rtnl; + + netdev_lock_ops(offmap->netdev); + down_write(&bpf_devs_lock); ondev = bpf_offload_find_netdev(offmap->netdev); if (!ondev) { @@ -548,12 +550,15 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) list_add_tail(&offmap->offloads, &ondev->maps); up_write(&bpf_devs_lock); + netdev_unlock_ops(offmap->netdev); rtnl_unlock(); return &offmap->map; err_unlock: up_write(&bpf_devs_lock); + netdev_unlock_ops(offmap->netdev); +err_unlock_rtnl: rtnl_unlock(); bpf_map_area_free(offmap); return ERR_PTR(err); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 034cf87b54e9..632762b57299 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -14,11 +14,9 @@ int pcpu_freelist_init(struct pcpu_freelist *s) for_each_possible_cpu(cpu) { struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); - raw_spin_lock_init(&head->lock); + raw_res_spin_lock_init(&head->lock); 
head->first = NULL; } - raw_spin_lock_init(&s->extralist.lock); - s->extralist.first = NULL; return 0; } @@ -34,58 +32,39 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head, WRITE_ONCE(head->first, node); } -static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, +static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head, struct pcpu_freelist_node *node) { - raw_spin_lock(&head->lock); - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); -} - -static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) -{ - if (!raw_spin_trylock(&s->extralist.lock)) + if (raw_res_spin_lock(&head->lock)) return false; - - pcpu_freelist_push_node(&s->extralist, node); - raw_spin_unlock(&s->extralist.lock); + pcpu_freelist_push_node(head, node); + raw_res_spin_unlock(&head->lock); return true; } -static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) +void __pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) { - int cpu, orig_cpu; + struct pcpu_freelist_head *head; + int cpu; - orig_cpu = raw_smp_processor_id(); - while (1) { - for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) { - struct pcpu_freelist_head *head; + if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node)) + return; + while (true) { + for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { + if (cpu == raw_smp_processor_id()) + continue; head = per_cpu_ptr(s->freelist, cpu); - if (raw_spin_trylock(&head->lock)) { - pcpu_freelist_push_node(head, node); - raw_spin_unlock(&head->lock); - return; - } - } - - /* cannot lock any per cpu lock, try extralist */ - if (pcpu_freelist_try_push_extra(s, node)) + if (raw_res_spin_lock(&head->lock)) + continue; + pcpu_freelist_push_node(head, node); + raw_res_spin_unlock(&head->lock); return; + } } } -void __pcpu_freelist_push(struct pcpu_freelist *s, - struct pcpu_freelist_node *node) -{ - if (in_nmi()) - ___pcpu_freelist_push_nmi(s, node); - else - ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node); -} - void pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { @@ -120,71 +99,29 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s) { + struct pcpu_freelist_node *node = NULL; struct pcpu_freelist_head *head; - struct pcpu_freelist_node *node; int cpu; for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { head = per_cpu_ptr(s->freelist, cpu); if (!READ_ONCE(head->first)) continue; - raw_spin_lock(&head->lock); + if (raw_res_spin_lock(&head->lock)) + continue; node = head->first; if (node) { WRITE_ONCE(head->first, node->next); - raw_spin_unlock(&head->lock); + raw_res_spin_unlock(&head->lock); return node; } - raw_spin_unlock(&head->lock); + raw_res_spin_unlock(&head->lock); } - - /* per cpu lists are all empty, try extralist */ - if (!READ_ONCE(s->extralist.first)) - return NULL; - raw_spin_lock(&s->extralist.lock); - node = s->extralist.first; - if (node) - WRITE_ONCE(s->extralist.first, node->next); - raw_spin_unlock(&s->extralist.lock); - return node; -} - -static struct pcpu_freelist_node * -___pcpu_freelist_pop_nmi(struct pcpu_freelist *s) -{ - struct pcpu_freelist_head *head; - struct pcpu_freelist_node *node; - int cpu; - - for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) { - head = per_cpu_ptr(s->freelist, cpu); - if 
(!READ_ONCE(head->first)) - continue; - if (raw_spin_trylock(&head->lock)) { - node = head->first; - if (node) { - WRITE_ONCE(head->first, node->next); - raw_spin_unlock(&head->lock); - return node; - } - raw_spin_unlock(&head->lock); - } - } - - /* cannot pop from per cpu lists, try extralist */ - if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock)) - return NULL; - node = s->extralist.first; - if (node) - WRITE_ONCE(s->extralist.first, node->next); - raw_spin_unlock(&s->extralist.lock); return node; } struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { - if (in_nmi()) - return ___pcpu_freelist_pop_nmi(s); return ___pcpu_freelist_pop(s); } diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index 3c76553cfe57..914798b74967 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -5,15 +5,15 @@ #define __PERCPU_FREELIST_H__ #include <linux/spinlock.h> #include <linux/percpu.h> +#include <asm/rqspinlock.h> struct pcpu_freelist_head { struct pcpu_freelist_node *first; - raw_spinlock_t lock; + rqspinlock_t lock; }; struct pcpu_freelist { struct pcpu_freelist_head __percpu *freelist; - struct pcpu_freelist_head extralist; }; struct pcpu_freelist_node { diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 0c63bc2cd895..2fdf3c978db1 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -90,3 +90,4 @@ static void __exit fini(void) late_initcall(load); module_exit(fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs"); diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e1cfe890e0be..1499d8caa9a3 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -268,8 +268,6 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma /* allow writable mapping for the consumer_pos only */ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; - } else { - vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, @@ -289,8 +287,6 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma * position, and the ring buffer data itself. */ return -EPERM; - } else { - vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c new file mode 100644 index 000000000000..b896c4a75a5c --- /dev/null +++ b/kernel/bpf/rqspinlock.c @@ -0,0 +1,737 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Resilient Queued Spin Lock + * + * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. + * (C) Copyright 2013-2014,2018 Red Hat, Inc. + * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. 
+ * + * Authors: Waiman Long <longman@redhat.com> + * Peter Zijlstra <peterz@infradead.org> + * Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ + +#include <linux/smp.h> +#include <linux/bug.h> +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/mutex.h> +#include <linux/prefetch.h> +#include <asm/byteorder.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#endif +#include <trace/events/lock.h> +#include <asm/rqspinlock.h> +#include <linux/timekeeping.h> + +/* + * Include queued spinlock definitions and statistics code + */ +#ifdef CONFIG_QUEUED_SPINLOCKS +#include "../locking/qspinlock.h" +#include "../locking/lock_events.h" +#include "rqspinlock.h" +#include "../locking/mcs_spinlock.h" +#endif + +/* + * The basic principle of a queue-based spinlock can best be understood + * by studying a classic queue-based spinlock implementation called the + * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable + * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and + * Scott") is available at + * + * https://bugzilla.kernel.org/show_bug.cgi?id=206115 + * + * This queued spinlock implementation is based on the MCS lock, however to + * make it fit the 4 bytes we assume spinlock_t to be, and preserve its + * existing API, we must modify it somehow. + * + * In particular; where the traditional MCS lock consists of a tail pointer + * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to + * unlock the next pending (next->locked), we compress both these: {tail, + * next->locked} into a single u32 value. + * + * Since a spinlock disables recursion of its own context and there is a limit + * to the contexts that can nest; namely: task, softirq, hardirq, nmi. As there + * are at most 4 nesting levels, it can be encoded by a 2-bit number. Now + * we can encode the tail by combining the 2-bit nesting level with the cpu + * number. With one byte for the lock value and 3 bytes for the tail, only a + * 32-bit word is now needed. Even though we only need 1 bit for the lock, + * we extend it to a full byte to achieve better performance for architectures + * that support atomic byte write. + * + * We also change the first spinner to spin on the lock bit instead of its + * node; whereby avoiding the need to carry a node from lock to unlock, and + * preserving existing lock API. This also makes the unlock code simpler and + * faster. + * + * N.B. The current implementation only supports architectures that allow + * atomic operations on smaller 8-bit and 16-bit data types. + * + */ + +struct rqspinlock_timeout { + u64 timeout_end; + u64 duration; + u64 cur; + u16 spin; +}; + +#define RES_TIMEOUT_VAL 2 + +DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks); +EXPORT_SYMBOL_GPL(rqspinlock_held_locks); + +static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts) +{ + if (!(atomic_read_acquire(&lock->val) & (mask))) + return true; + return false; +} + +static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + int cnt = min(RES_NR_HELD, rqh->cnt); + + /* + * Return an error if we hold the lock we are attempting to acquire. + * We'll iterate over max 32 locks; no need to do is_lock_released. 
+ */ + for (int i = 0; i < cnt - 1; i++) { + if (rqh->locks[i] == lock) + return -EDEADLK; + } + return 0; +} + +/* + * This focuses on the most common case of ABBA deadlocks (or ABBA involving + * more locks, which reduce to ABBA). This is not exhaustive, and we rely on + * timeouts as the final line of defense. + */ +static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + int rqh_cnt = min(RES_NR_HELD, rqh->cnt); + void *remote_lock; + int cpu; + + /* + * Find the CPU holding the lock that we want to acquire. If there is a + * deadlock scenario, we will read a stable set on the remote CPU and + * find the target. This would be a constant time operation instead of + * O(NR_CPUS) if we could determine the owning CPU from a lock value, but + * that requires increasing the size of the lock word. + */ + for_each_possible_cpu(cpu) { + struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu); + int real_cnt = READ_ONCE(rqh_cpu->cnt); + int cnt = min(RES_NR_HELD, real_cnt); + + /* + * Let's ensure to break out of this loop if the lock is available for + * us to potentially acquire. + */ + if (is_lock_released(lock, mask, ts)) + return 0; + + /* + * Skip ourselves, and CPUs whose count is less than 2, as they need at + * least one held lock and one acquisition attempt (reflected as top + * most entry) to participate in an ABBA deadlock. + * + * If cnt is more than RES_NR_HELD, it means the current lock being + * acquired won't appear in the table, and other locks in the table are + * already held, so we can't determine ABBA. + */ + if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD) + continue; + + /* + * Obtain the entry at the top, this corresponds to the lock the + * remote CPU is attempting to acquire in a deadlock situation, + * and would be one of the locks we hold on the current CPU. + */ + remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]); + /* + * If it is NULL, we've raced and cannot determine a deadlock + * conclusively, skip this CPU. + */ + if (!remote_lock) + continue; + /* + * Find if the lock we're attempting to acquire is held by this CPU. + * Don't consider the topmost entry, as that must be the latest lock + * being held or acquired. For a deadlock, the target CPU must also + * attempt to acquire a lock we hold, so for this search only 'cnt - 1' + * entries are important. + */ + for (int i = 0; i < cnt - 1; i++) { + if (READ_ONCE(rqh_cpu->locks[i]) != lock) + continue; + /* + * We found our lock as held on the remote CPU. Is the + * acquisition attempt on the remote CPU for a lock held + * by us? If so, we have a deadlock situation, and need + * to recover. + */ + for (int i = 0; i < rqh_cnt - 1; i++) { + if (rqh->locks[i] == remote_lock) + return -EDEADLK; + } + /* + * Inconclusive; retry again later. 
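A concrete two-CPU illustration of the case this check targets (an editorial sketch restating the logic above, not part of the patch):

    /*
     * CPU 0                          CPU 1
     *   res_spin_lock(&A)  held        res_spin_lock(&B)  held
     *   res_spin_lock(&B)  spins       res_spin_lock(&A)  spins
     *
     * CPU 0 held-locks table: { A, B }   (B is the top entry, being acquired)
     * CPU 1 held-locks table: { B, A }   (A is the top entry, being acquired)
     *
     * While spinning on B, CPU 0 scans the other CPUs, finds that CPU 1 holds
     * B and that CPU 1's top entry is A, a lock CPU 0 already holds, and so
     * returns -EDEADLK, allowing one side to unwind.
     */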
+ */ + return 0; + } + } + return 0; +} + +static noinline int check_deadlock(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + int ret; + + ret = check_deadlock_AA(lock, mask, ts); + if (ret) + return ret; + ret = check_deadlock_ABBA(lock, mask, ts); + if (ret) + return ret; + + return 0; +} + +static noinline int check_timeout(rqspinlock_t *lock, u32 mask, + struct rqspinlock_timeout *ts) +{ + u64 time = ktime_get_mono_fast_ns(); + u64 prev = ts->cur; + + if (!ts->timeout_end) { + ts->cur = time; + ts->timeout_end = time + ts->duration; + return 0; + } + + if (time > ts->timeout_end) + return -ETIMEDOUT; + + /* + * A millisecond interval passed from last time? Trigger deadlock + * checks. + */ + if (prev + NSEC_PER_MSEC < time) { + ts->cur = time; + return check_deadlock(lock, mask, ts); + } + + return 0; +} + +/* + * Do not amortize with spins when res_smp_cond_load_acquire is defined, + * as the macro does internal amortization for us. + */ +#ifndef res_smp_cond_load_acquire +#define RES_CHECK_TIMEOUT(ts, ret, mask) \ + ({ \ + if (!(ts).spin++) \ + (ret) = check_timeout((lock), (mask), &(ts)); \ + (ret); \ + }) +#else +#define RES_CHECK_TIMEOUT(ts, ret, mask) \ + ({ (ret) = check_timeout(&(ts)); }) +#endif + +/* + * Initialize the 'spin' member. + * Set spin member to 0 to trigger AA/ABBA checks immediately. + */ +#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; }) + +/* + * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary. + * Duration is defined for each spin attempt, so set it here. + */ +#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; }) + +/* + * Provide a test-and-set fallback for cases when queued spin lock support is + * absent from the architecture. + */ +int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock) +{ + struct rqspinlock_timeout ts; + int val, ret = 0; + + RES_INIT_TIMEOUT(ts); + grab_held_lock_entry(lock); + + /* + * Since the waiting loop's time is dependent on the amount of + * contention, a short timeout unlike rqspinlock waiting loops + * isn't enough. Choose a second as the timeout value. + */ + RES_RESET_TIMEOUT(ts, NSEC_PER_SEC); +retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) { + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) + goto out; + cpu_relax(); + goto retry; + } + + return 0; +out: + release_held_lock_entry(); + return ret; +} +EXPORT_SYMBOL_GPL(resilient_tas_spin_lock); + +#ifdef CONFIG_QUEUED_SPINLOCKS + +/* + * Per-CPU queue node structures; we can never have more than 4 nested + * contexts: task, softirq, hardirq, nmi. + * + * Exactly fits one 64-byte cacheline on a 64-bit architecture. + */ +static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]); + +#ifndef res_smp_cond_load_acquire +#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c) +#endif + +#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c)) + +/** + * resilient_queued_spin_lock_slowpath - acquire the queued spinlock + * @lock: Pointer to queued spinlock structure + * @val: Current value of the queued spinlock 32-bit word + * + * Return: + * * 0 - Lock was acquired successfully. + * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock. + * * -ETIMEDOUT - Lock acquisition failed because of timeout. 
+ * + * (queue tail, pending bit, lock value) + * + * fast : slow : unlock + * : : + * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0) + * : | ^--------.------. / : + * : v \ \ | : + * pending : (0,1,1) +--> (0,1,0) \ | : + * : | ^--' | | : + * : v | | : + * uncontended : (n,x,y) +--> (n,0,0) --' | : + * queue : | ^--' | : + * : v | : + * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : + * queue : ^--' : + */ +int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val) +{ + struct mcs_spinlock *prev, *next, *node; + struct rqspinlock_timeout ts; + int idx, ret = 0; + u32 old, tail; + + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); + + if (resilient_virt_spin_lock_enabled()) + return resilient_virt_spin_lock(lock); + + RES_INIT_TIMEOUT(ts); + + /* + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--); + } + + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* + * trylock || pending + * + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock + */ + val = queued_fetch_set_pending_acquire(lock); + + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + + goto queue; + } + + /* + * Grab an entry in the held locks array, to enable deadlock detection. + */ + grab_held_lock_entry(lock); + + /* + * We're pending, wait for the owner to go away. + * + * 0,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. + */ + if (val & _Q_LOCKED_MASK) { + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); + res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK)); + } + + if (ret) { + /* + * We waited for the locked bit to go back to 0, as the pending + * waiter, but timed out. We need to clear the pending bit since + * we own it. Once a stuck owner has been recovered, the lock + * must be restored to a valid state, hence removing the pending + * bit is necessary. + * + * *,1,* -> *,0,* + */ + clear_pending(lock); + lockevent_inc(rqspinlock_lock_timeout); + goto err_release_entry; + } + + /* + * take ownership and clear the pending bit. + * + * 0,1,0 -> 0,0,1 + */ + clear_pending_set_locked(lock); + lockevent_inc(lock_pending); + return 0; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + lockevent_inc(lock_slowpath); + /* + * Grab deadlock detection entry for the queue path. + */ + grab_held_lock_entry(lock); + + node = this_cpu_ptr(&rqnodes[0].mcs); + idx = node->count++; + tail = encode_tail(smp_processor_id(), idx); + + trace_contention_begin(lock, LCB_F_SPIN); + + /* + * 4 nodes are allocated based on the assumption that there will + * not be nested NMIs taking spinlocks. 
That may not be true in + * some architectures even though the chance of needing more than + * 4 nodes will still be extremely unlikely. When that happens, + * we fall back to spinning on the lock directly without using + * any MCS node. This is not the most elegant solution, but is + * simple enough. + */ + if (unlikely(idx >= _Q_MAX_NODES)) { + lockevent_inc(lock_no_node); + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); + while (!queued_spin_trylock(lock)) { + if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) { + lockevent_inc(rqspinlock_lock_timeout); + goto err_release_node; + } + cpu_relax(); + } + goto release; + } + + node = grab_mcs_node(node, idx); + + /* + * Keep counts of non-zero index values: + */ + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + + node->locked = 0; + node->next = NULL; + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (queued_spin_trylock(lock)) + goto release; + + /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + next = NULL; + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + int val; + + prev = decode_tail(old, rqnodes); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + val = arch_mcs_spin_lock_contended(&node->locked); + if (val == RES_TIMEOUT_VAL) { + ret = -EDEADLK; + goto waitq_timeout; + } + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We optimistically load + * the next pointer & prefetch the cacheline for writing + * to reduce latency in the upcoming MCS unlock operation. + */ + next = READ_ONCE(node->next); + if (next) + prefetchw(next); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + * + * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is + * meant to span maximum allowed time per critical section, and we may + * have both the owner of the lock and the pending bit waiter ahead of + * us. + */ + RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2); + val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) || + RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK)); + +waitq_timeout: + if (ret) { + /* + * If the tail is still pointing to us, then we are the final waiter, + * and are responsible for resetting the tail back to 0. Otherwise, if + * the cmpxchg operation fails, we signal the next waiter to take exit + * and try the same. 
For a waiter with tail node 'n': + * + * n,*,* -> 0,*,* + * + * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is + * possible locked/pending bits keep changing and we see failures even + * when we remain the head of wait queue. However, eventually, + * pending bit owner will unset the pending bit, and new waiters + * will queue behind us. This will leave the lock owner in + * charge, and it will eventually either set locked bit to 0, or + * leave it as 1, allowing us to make progress. + * + * We terminate the whole wait queue for two reasons. Firstly, + * we eschew per-waiter timeouts with one applied at the head of + * the wait queue. This allows everyone to break out faster + * once we've seen the owner / pending waiter not responding for + * the timeout duration from the head. Secondly, it avoids + * complicated synchronization, because when not leaving in FIFO + * order, prev's next pointer needs to be fixed up etc. + */ + if (!try_cmpxchg_tail(lock, tail, 0)) { + next = smp_cond_load_relaxed(&node->next, VAL); + WRITE_ONCE(next->locked, RES_TIMEOUT_VAL); + } + lockevent_inc(rqspinlock_lock_timeout); + goto err_release_node; + } + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,*,0 -> *,*,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. + */ + + /* + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. + */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } + + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ + set_locked(lock); + + /* + * contended path; wait for next if not observed yet, release. 
+ */ + if (!next) + next = smp_cond_load_relaxed(&node->next, (VAL)); + + arch_mcs_spin_unlock_contended(&next->locked); + +release: + trace_contention_end(lock, 0); + + /* + * release the node + */ + __this_cpu_dec(rqnodes[0].mcs.count); + return ret; +err_release_node: + trace_contention_end(lock, ret); + __this_cpu_dec(rqnodes[0].mcs.count); +err_release_entry: + release_held_lock_entry(); + return ret; +} +EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); + +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +__bpf_kfunc_start_defs(); + +__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) +{ + int ret; + + BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); + BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); + + preempt_disable(); + ret = res_spin_lock((rqspinlock_t *)lock); + if (unlikely(ret)) { + preempt_enable(); + return ret; + } + return 0; +} + +__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) +{ + res_spin_unlock((rqspinlock_t *)lock); + preempt_enable(); +} + +__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) +{ + u64 *ptr = (u64 *)flags__irq_flag; + unsigned long flags; + int ret; + + preempt_disable(); + local_irq_save(flags); + ret = res_spin_lock((rqspinlock_t *)lock); + if (unlikely(ret)) { + local_irq_restore(flags); + preempt_enable(); + return ret; + } + *ptr = flags; + return 0; +} + +__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag) +{ + u64 *ptr = (u64 *)flags__irq_flag; + unsigned long flags = *ptr; + + res_spin_unlock((rqspinlock_t *)lock); + local_irq_restore(flags); + preempt_enable(); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(rqspinlock_kfunc_ids) +BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_res_spin_unlock) +BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore) +BTF_KFUNCS_END(rqspinlock_kfunc_ids) + +static const struct btf_kfunc_id_set rqspinlock_kfunc_set = { + .owner = THIS_MODULE, + .set = &rqspinlock_kfunc_ids, +}; + +static __init int rqspinlock_register_kfuncs(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set); +} +late_initcall(rqspinlock_register_kfuncs); diff --git a/kernel/bpf/rqspinlock.h b/kernel/bpf/rqspinlock.h new file mode 100644 index 000000000000..5d8cb1b1aab4 --- /dev/null +++ b/kernel/bpf/rqspinlock.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Resilient Queued Spin Lock defines + * + * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. + * + * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> + */ +#ifndef __LINUX_RQSPINLOCK_H +#define __LINUX_RQSPINLOCK_H + +#include "../locking/qspinlock.h" + +/* + * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value + * @lock: Pointer to queued spinlock structure + * @tail: The tail to compare against + * @new_tail: The new queue tail code word + * Return: Bool to indicate whether the cmpxchg operation succeeded + * + * This is used by the head of the wait queue to clean up the queue. + * Provides relaxed ordering, since observers only rely on initialized + * state of the node which was made visible through the xchg_tail operation, + * i.e. through the smp_wmb preceding xchg_tail. + * + * We avoid using 16-bit cmpxchg, which is not available on all architectures. 
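For the bpf_res_spin_lock*() kfuncs exported above, a hedged BPF-program-side sketch; the map layout and names are assumptions for illustration (the usual vmlinux.h/bpf_helpers.h includes are assumed), not part of the patch:

    struct elem {
        struct bpf_res_spin_lock lock;
        long counter;
    };

    struct {
        __uint(type, BPF_MAP_TYPE_ARRAY);
        __uint(max_entries, 1);
        __type(key, int);
        __type(value, struct elem);
    } counters SEC(".maps");

    static int bump(void)
    {
        int key = 0;
        struct elem *val = bpf_map_lookup_elem(&counters, &key);

        if (!val)
            return 0;
        /* Unlike bpf_spin_lock(), acquisition can fail (-EDEADLK/-ETIMEDOUT). */
        if (bpf_res_spin_lock(&val->lock))
            return 0;
        val->counter++;
        bpf_res_spin_unlock(&val->lock);
        return 0;
    }

The key difference from bpf_spin_lock() is the checked return value: on deadlock or timeout the program simply skips the critical section instead of wedging the CPU.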
+ */ +static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + /* + * Is the tail part we compare to already stale? Fail. + */ + if ((old & _Q_TAIL_MASK) != tail) + return false; + /* + * Encode latest locked/pending state for new tail. + */ + new = (old & _Q_LOCKED_PENDING_MASK) | new_tail; + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return true; +} + +#endif /* __LINUX_RQSPINLOCK_H */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c420edbfb7c8..9794446bc8c6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +static bool can_alloc_pages(void) +{ + return preempt_count() == 0 && !irqs_disabled() && + !IS_ENABLED(CONFIG_PREEMPT_RT); +} + +static struct page *__bpf_alloc_page(int nid) +{ + if (!can_alloc_pages()) + return try_alloc_pages(nid, 0); + + return alloc_pages_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT + | __GFP_NOWARN, + 0); +} + +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **pages) { unsigned long i, j; @@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, old_memcg = set_active_memcg(memcg); #endif for (i = 0; i < nr_pages; i++) { - pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + pg = __bpf_alloc_page(nid); if (pg) { pages[i] = pg; continue; } for (j = 0; j < i; j++) - __free_page(pages[j]); + free_pages_nolock(pages[j], 0); ret = -ENOMEM; break; } @@ -648,6 +665,7 @@ void btf_record_free(struct btf_record *rec) case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: @@ -700,6 +718,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_RB_ROOT: case BPF_RB_NODE: case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: @@ -777,6 +796,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) switch (fields[i].type) { case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: break; case BPF_TIMER: bpf_timer_cancel_and_free(field_ptr); @@ -1035,7 +1055,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = { static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; - int err; + int err = 0; if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; @@ -1059,24 +1079,33 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) err = -EACCES; goto out; } + bpf_map_write_active_inc(map); } +out: + mutex_unlock(&map->freeze_mutex); + if (err) + return err; /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; vm_flags_clear(vma, VM_MAYEXEC); + /* If mapping is read-only, then disallow potentially re-mapping with + * PROT_WRITE by dropping VM_MAYWRITE flag. 
This VM_MAYWRITE clearing + * means that as far as BPF map's memory-mapped VMAs are concerned, + * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set, + * both should be set, so we can forget about VM_MAYWRITE and always + * check just VM_WRITE + */ if (!(vma->vm_flags & VM_WRITE)) - /* disallow re-mapping with PROT_WRITE */ vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); - if (err) - goto out; + if (err) { + if (vma->vm_flags & VM_WRITE) + bpf_map_write_active_dec(map); + } - if (vma->vm_flags & VM_MAYWRITE) - bpf_map_write_active_inc(map); -out: - mutex_unlock(&map->freeze_mutex); return err; } @@ -1203,7 +1232,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, return -EINVAL; map->record = btf_parse_fields(btf, value_type, - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | + BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { @@ -1222,6 +1251,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, case 0: continue; case BPF_SPIN_LOCK: + case BPF_RES_SPIN_LOCK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && @@ -1306,7 +1336,7 @@ static bool bpf_net_capable(void) #define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ -static int map_create(union bpf_attr *attr) +static int map_create(union bpf_attr *attr, bool kernel) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1496,7 +1526,7 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token); + err = security_bpf_map_create(map, attr, token, kernel); if (err) goto free_map_sec; @@ -1584,11 +1614,8 @@ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { - spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, false); - spin_unlock_bh(&map_idr_lock); - - return map; + lockdep_assert(rcu_read_lock_held()); + return __bpf_map_inc_not_zero(map, false); } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); @@ -1968,8 +1995,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, return err; } -#define MAP_LOOKUP_RETRIES 3 - int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1979,8 +2004,8 @@ int generic_map_lookup_batch(struct bpf_map *map, void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); void *buf, *buf_prevkey, *prev_key, *key, *value; - int err, retry = MAP_LOOKUP_RETRIES; u32 value_size, cp, max_count; + int err; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; @@ -2026,14 +2051,8 @@ int generic_map_lookup_batch(struct bpf_map *map, err = bpf_map_copy_value(map, key, value, attr->batch.elem_flags); - if (err == -ENOENT) { - if (retry) { - retry--; - continue; - } - err = -EINTR; - break; - } + if (err == -ENOENT) + goto next_key; if (err) goto free_buf; @@ -2048,12 +2067,12 @@ int generic_map_lookup_batch(struct bpf_map *map, goto free_buf; } + cp++; +next_key: if (!prev_key) prev_key = buf_prevkey; swap(prev_key, key); - retry = MAP_LOOKUP_RETRIES; - cp++; cond_resched(); } @@ -2313,6 +2332,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) kvfree(prog->aux->jited_linfo); 
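For context on the VM_MAYWRITE handling in bpf_map_mmap() above, a hedged userspace sketch of the resulting semantics (illustrative only; error handling omitted):

    #include <sys/mman.h>
    #include <bpf/bpf.h>

    static void map_mmap_demo(void)
    {
        LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE);
        int fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "demo", sizeof(__u32),
                                sizeof(__u64), 1, &opts);

        /* Read-only mapping: VM_MAYWRITE is dropped, so it never counts as a
         * writer and a later mprotect(PROT_WRITE) on it is refused. */
        void *ro = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

        /* Writable mapping: counts as an active writer, so BPF_MAP_FREEZE
         * fails with -EBUSY while the mapping exists. */
        void *rw = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        (void)ro; (void)rw;
    }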
kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); + kfree(prog->aux->ctx_arg_info); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); @@ -2943,7 +2963,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_prog; - err = security_bpf_prog_load(prog, attr, token); + err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) goto free_prog_sec; @@ -4168,7 +4188,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ - BPF_F_REPLACE) + BPF_F_REPLACE | \ + BPF_F_PREORDER) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ @@ -4732,6 +4753,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; + if (prog->aux->btf) + info.btf_id = btf_obj_id(prog->aux->btf); if (!bpf_capable()) { info.jited_prog_len = 0; @@ -4878,8 +4901,6 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - if (prog->aux->btf) - info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); @@ -5120,15 +5141,34 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_ return btf_new_fd(attr, uattr, uattr_size); } -#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id +#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) { + struct bpf_token *token = NULL; + if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (attr->open_flags & ~BPF_F_TOKEN_FD) + return -EINVAL; + + if (attr->open_flags & BPF_F_TOKEN_FD) { + token = bpf_token_get_from_fd(attr->fd_by_id_token_fd); + if (IS_ERR(token)) + return PTR_ERR(token); + if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) { + bpf_token_put(token); + token = NULL; + } + } + + if (!bpf_token_capable(token, CAP_SYS_ADMIN)) { + bpf_token_put(token); return -EPERM; + } + + bpf_token_put(token); return btf_get_fd_by_id(attr->btf_id); } @@ -5767,13 +5807,13 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; - err = security_bpf(cmd, &attr, size); + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr); + err = map_create(&attr, uattr.is_kernel); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9971c03adfd5..54c6953a8b84 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -456,7 +456,7 @@ static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog) static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { - return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -579,6 +579,13 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } +static bool is_atomic_load_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ; +} + static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -1148,7 +1155,8 @@ static int release_irq_state(struct 
bpf_verifier_state *state, int id); static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta, - struct bpf_reg_state *reg, int insn_idx) + struct bpf_reg_state *reg, int insn_idx, + int kfunc_class) { struct bpf_func_state *state = func(env, reg); struct bpf_stack_state *slot; @@ -1170,6 +1178,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; + st->irq.kfunc_class = kfunc_class; for (i = 0; i < BPF_REG_SIZE; i++) slot->slot_type[i] = STACK_IRQ_FLAG; @@ -1178,7 +1187,8 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, return 0; } -static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int kfunc_class) { struct bpf_func_state *state = func(env, reg); struct bpf_stack_state *slot; @@ -1192,6 +1202,15 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r slot = &state->stack[spi]; st = &slot->spilled_ptr; + if (st->irq.kfunc_class != kfunc_class) { + const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock"; + + verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n", + flag_kfunc, used_kfunc); + return -EINVAL; + } + err = release_irq_state(env->cur_state, st->ref_obj_id); WARN_ON_ONCE(err && err != -EACCES); if (err) { @@ -1409,6 +1428,8 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf dst->active_preempt_locks = src->active_preempt_locks; dst->active_rcu_lock = src->active_rcu_lock; dst->active_irq_id = src->active_irq_id; + dst->active_lock_id = src->active_lock_id; + dst->active_lock_ptr = src->active_lock_ptr; return 0; } @@ -1501,11 +1522,15 @@ static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum r struct bpf_reference_state *s; s = acquire_reference_state(env, insn_idx); + if (!s) + return -ENOMEM; s->type = type; s->id = id; s->ptr = ptr; state->active_locks++; + state->active_lock_id = id; + state->active_lock_ptr = ptr; return 0; } @@ -1543,18 +1568,37 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx) return; } +static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id) +{ + int i; + + for (i = 0; i < state->acquired_refs; i++) + if (state->refs[i].id == ptr_id) + return true; + + return false; +} + static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) { + void *prev_ptr = NULL; + u32 prev_id = 0; int i; for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].type != type) - continue; - if (state->refs[i].id == id && state->refs[i].ptr == ptr) { + if (state->refs[i].type == type && state->refs[i].id == id && + state->refs[i].ptr == ptr) { release_reference_state(state, i); state->active_locks--; + /* Reassign active lock (id, ptr). 
*/ + state->active_lock_id = prev_id; + state->active_lock_ptr = prev_ptr; return 0; } + if (state->refs[i].type & REF_TYPE_LOCK_MASK) { + prev_id = state->refs[i].id; + prev_ptr = state->refs[i].ptr; + } } return -EINVAL; } @@ -1589,7 +1633,7 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st for (i = 0; i < state->acquired_refs; i++) { struct bpf_reference_state *s = &state->refs[i]; - if (s->type != type) + if (!(s->type & type)) continue; if (s->id == id && s->ptr == ptr) @@ -1598,6 +1642,14 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st return NULL; } +static void update_peak_states(struct bpf_verifier_env *env) +{ + u32 cur_states; + + cur_states = env->explored_states_size + env->free_list_size; + env->peak_states = max(env->peak_states, cur_states); +} + static void free_func_state(struct bpf_func_state *state) { if (!state) @@ -1620,6 +1672,50 @@ static void free_verifier_state(struct bpf_verifier_state *state, kfree(state); } +/* struct bpf_verifier_state->{parent,loop_entry} refer to states + * that are in either of env->{explored_states,free_list}. + * In both cases the state is contained in struct bpf_verifier_state_list. + */ +static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st) +{ + if (st->parent) + return container_of(st->parent, struct bpf_verifier_state_list, state); + return NULL; +} + +static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st) +{ + if (st->loop_entry) + return container_of(st->loop_entry, struct bpf_verifier_state_list, state); + return NULL; +} + +/* A state can be freed if it is no longer referenced: + * - is in the env->free_list; + * - has no children states; + * - is not used as loop_entry. + * + * Freeing a state can make its loop_entry free-able. 
+ */ +static void maybe_free_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state_list *sl) +{ + struct bpf_verifier_state_list *loop_entry_sl; + + while (sl && sl->in_free_list && + sl->state.branches == 0 && + sl->state.used_as_loop_entry == 0) { + loop_entry_sl = state_loop_entry_as_list(&sl->state); + if (loop_entry_sl) + loop_entry_sl->state.used_as_loop_entry--; + list_del(&sl->node); + free_verifier_state(&sl->state, false); + kfree(sl); + env->free_list_size--; + sl = loop_entry_sl; + } +} + /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ @@ -1659,6 +1755,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; + dst_state->loop_entry = src->loop_entry; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1679,7 +1776,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env) return env->prog->len; } -static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx) +static struct list_head *explored_state(struct bpf_verifier_env *env, int idx) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_func_state *state = cur->frame[cur->curframe]; @@ -1787,16 +1884,13 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * # Find outermost loop entry known for n * def get_loop_entry(n): * h = entries.get(n, None) - * while h in entries and entries[h] != h: + * while h in entries: * h = entries[h] * return h * - * # Update n's loop entry if h's outermost entry comes - * # before n's outermost entry in current DFS path. + * # Update n's loop entry if h comes before n in current DFS path. * def update_loop_entry(n, h): - * n1 = get_loop_entry(n) or n - * h1 = get_loop_entry(h) or h - * if h1 in path and depths[h1] <= depths[n1]: + * if h in path and depths[entries.get(n, n)] < depths[n]: * entries[n] = h1 * * def dfs(n, depth): @@ -1808,7 +1902,7 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * # Case A: explore succ and update cur's loop entry * # only if succ's entry is in current DFS path. * dfs(succ, depth + 1) - * h = get_loop_entry(succ) + * h = entries.get(succ, None) * update_loop_entry(n, h) * else: * # Case B or C depending on `h1 in path` check in update_loop_entry(). @@ -1820,46 +1914,49 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta * and cur's loop entry has to be updated (case A), handle this in * update_branch_counts(); * - use st->branch > 0 as a signal that st is in the current DFS path; - * - handle cases B and C in is_state_visited(); - * - update topmost loop entry for intermediate states in get_loop_entry(). + * - handle cases B and C in is_state_visited(). 
*/ -static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st) +static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) { - struct bpf_verifier_state *topmost = st->loop_entry, *old; + struct bpf_verifier_state *topmost = st->loop_entry; + u32 steps = 0; - while (topmost && topmost->loop_entry && topmost != topmost->loop_entry) + while (topmost && topmost->loop_entry) { + if (steps++ > st->dfs_depth) { + WARN_ONCE(true, "verifier bug: infinite loop in get_loop_entry\n"); + verbose(env, "verifier bug: infinite loop in get_loop_entry()\n"); + return ERR_PTR(-EFAULT); + } topmost = topmost->loop_entry; - /* Update loop entries for intermediate states to avoid this - * traversal in future get_loop_entry() calls. - */ - while (st && st->loop_entry != topmost) { - old = st->loop_entry; - st->loop_entry = topmost; - st = old; } return topmost; } -static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) +static void update_loop_entry(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) { - struct bpf_verifier_state *cur1, *hdr1; - - cur1 = get_loop_entry(cur) ?: cur; - hdr1 = get_loop_entry(hdr) ?: hdr; - /* The head1->branches check decides between cases B and C in - * comment for get_loop_entry(). If hdr1->branches == 0 then + /* The hdr->branches check decides between cases B and C in + * comment for get_loop_entry(). If hdr->branches == 0 then * head's topmost loop entry is not in current DFS path, * hence 'cur' and 'hdr' are not in the same loop and there is * no need to update cur->loop_entry. */ - if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) { + if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) { + if (cur->loop_entry) { + cur->loop_entry->used_as_loop_entry--; + maybe_free_verifier_state(env, state_loop_entry_as_list(cur)); + } cur->loop_entry = hdr; - hdr->used_as_loop_entry = true; + hdr->used_as_loop_entry++; } } static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { + struct bpf_verifier_state_list *sl = NULL, *parent_sl; + struct bpf_verifier_state *parent; + while (st) { u32 br = --st->branches; @@ -1869,7 +1966,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi * This is a part of 'case A' in get_loop_entry() comment. */ if (br == 0 && st->parent && st->loop_entry) - update_loop_entry(st->parent, st->loop_entry); + update_loop_entry(env, st->parent, st->loop_entry); /* WARN_ON(br > 1) technically makes sense here, * but see comment in push_stack(), hence: @@ -1879,7 +1976,12 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi br); if (br) break; - st = st->parent; + parent = st->parent; + parent_sl = state_parent_as_list(st); + if (sl) + maybe_free_verifier_state(env, sl); + st = parent; + sl = parent_sl; } } @@ -3204,6 +3306,21 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, return res ? 
&res->func_model : NULL; } +static int add_kfunc_in_insns(struct bpf_verifier_env *env, + struct bpf_insn *insn, int cnt) +{ + int i, ret; + + for (i = 0; i < cnt; i++, insn++) { + if (bpf_pseudo_kfunc_call(insn)) { + ret = add_kfunc_call(env, insn->imm, insn->off); + if (ret < 0) + return ret; + } + } + return 0; +} + static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprog = env->subprog_info; @@ -3267,6 +3384,15 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } +static int jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3293,10 +3419,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - if (code == (BPF_JMP32 | BPF_JA)) - off = i + insn[i].imm + 1; - else - off = i + insn[i].off + 1; + off = i + jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -3481,7 +3604,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (class == BPF_STX) { - /* BPF_STX (including atomic variants) has multiple source + /* BPF_STX (including atomic variants) has one or more source * operands, one of which is a ptr. Check whether the caller is * asking about it. */ @@ -3826,6 +3949,17 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } +static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, + .cb_print = verbose, + .private_data = env, + }; + + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); +} + static inline void bt_init(struct backtrack_state *bt, u32 frame) { bt->frame = frame; @@ -4026,11 +4160,6 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); @@ -4048,7 +4177,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); } /* If there is a history record that some registers gained range at this insn, @@ -4095,7 +4224,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * dreg still needs precision before this insn */ } - } else if (class == BPF_LDX) { + } else if (class == BPF_LDX || is_atomic_load_insn(insn)) { if (!bt_is_reg_set(bt, dreg)) return 0; bt_clear_reg(bt, dreg); @@ -5980,18 +6109,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. 
Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, - enum bpf_access_type t, enum bpf_reg_type *reg_type, - struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx) + enum bpf_access_type t, struct bpf_insn_access_aux *info) { - struct bpf_insn_access_aux info = { - .reg_type = *reg_type, - .log = &env->log, - .is_retval = false, - .is_ldsx = is_ldsx, - }; - if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, env->prog, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -5999,14 +6120,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. */ - *reg_type = info.reg_type; - *is_retval = info.is_retval; - - if (base_type(*reg_type) == PTR_TO_BTF_ID) { - *btf = info.btf; - *btf_id = info.btf_id; + if (base_type(info->reg_type) == PTR_TO_BTF_ID) { + if (info->ref_obj_id && + !find_reference_state(env->cur_state, info->ref_obj_id)) { + verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n", + off); + return -EACCES; + } } else { - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size; } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -6116,6 +6238,26 @@ static bool is_arena_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_ARENA; } +/* Return false if @regno contains a pointer whose type isn't supported for + * atomic instruction @insn. + */ +static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno, + struct bpf_insn *insn) +{ + if (is_ctx_reg(env, regno)) + return false; + if (is_pkt_reg(env, regno)) + return false; + if (is_flow_key_reg(env, regno)) + return false; + if (is_sk_reg(env, regno)) + return false; + if (is_arena_reg(env, regno)) + return bpf_jit_supports_insn(insn, true); + + return true; +} + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], @@ -7363,11 +7505,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { - bool is_retval = false; struct bpf_retval_range range; - enum bpf_reg_type reg_type = SCALAR_VALUE; - struct btf *btf = NULL; - u32 btf_id = 0; + struct bpf_insn_access_aux info = { + .reg_type = SCALAR_VALUE, + .is_ldsx = is_ldsx, + .log = &env->log, + }; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -7379,8 +7522,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, - &btf_id, &is_retval, is_ldsx); + err = check_ctx_access(env, insn_idx, off, size, t, &info); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -7388,8 +7530,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. 
*/ - if (reg_type == SCALAR_VALUE) { - if (is_retval && get_func_retval_range(env->prog, &range)) { + if (info.reg_type == SCALAR_VALUE) { + if (info.is_retval && get_func_retval_range(env->prog, &range)) { err = __mark_reg_s32_range(env, regs, value_regno, range.minval, range.maxval); if (err) @@ -7400,7 +7542,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (type_may_be_null(reg_type)) + if (type_may_be_null(info.reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -7408,12 +7550,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (base_type(reg_type) == PTR_TO_BTF_ID) { - regs[value_regno].btf = btf; - regs[value_regno].btf_id = btf_id; + if (base_type(info.reg_type) == PTR_TO_BTF_ID) { + regs[value_regno].btf = info.btf; + regs[value_regno].btf_id = info.btf_id; + regs[value_regno].ref_obj_id = info.ref_obj_id; } } - regs[value_regno].type = reg_type; + regs[value_regno].type = info.reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -7516,27 +7659,72 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch); -static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once, bool is_ldsx, + bool allow_trust_mismatch, const char *ctx) { - int load_reg; + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type src_reg_type; int err; - switch (insn->imm) { - case BPF_ADD: - case BPF_ADD | BPF_FETCH: - case BPF_AND: - case BPF_AND | BPF_FETCH: - case BPF_OR: - case BPF_OR | BPF_FETCH: - case BPF_XOR: - case BPF_XOR | BPF_FETCH: - case BPF_XCHG: - case BPF_CMPXCHG: - break; - default: - verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); - return -EINVAL; - } + /* check src operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check dst operand */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + src_reg_type = regs[insn->src_reg].type; + + /* Check if (src_reg + off) is readable. The state of dst_reg will be + * updated by this call. + */ + err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, + strict_alignment_once, is_ldsx); + err = err ?: save_aux_ptr_type(env, src_reg_type, + allow_trust_mismatch); + err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], ctx); + + return err; +} + +static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once) +{ + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type dst_reg_type; + int err; + + /* check src1 operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = regs[insn->dst_reg].type; + + /* Check if (dst_reg + off) is writeable. 
*/ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, + strict_alignment_once, false); + err = err ?: save_aux_ptr_type(env, dst_reg_type, false); + + return err; +} + +static int check_atomic_rmw(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int load_reg; + int err; if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { verbose(env, "invalid atomic operand size\n"); @@ -7572,11 +7760,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg) || - (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7603,12 +7787,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, - true, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, load_reg, true, false); if (err) return err; @@ -7618,13 +7802,86 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; } /* Check whether we can write into the same memory. 
*/ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; return 0; } +static int check_atomic_load(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_load_mem(env, insn, true, false, false, "atomic_load"); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) { + verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n", + insn->src_reg, + reg_type_str(env, reg_state(env, insn->src_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic_store(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int err; + + err = check_store_reg(env, insn, true); + if (err) + return err; + + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", + insn->dst_reg, + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); + return -EACCES; + } + + return 0; +} + +static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + return check_atomic_rmw(env, insn); + case BPF_LOAD_ACQ: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit load-acquires are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_load(env, insn); + case BPF_STORE_REL: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit store-releases are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_store(env, insn); + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", + insn->imm); + return -EINVAL; + } +} + /* When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending * on the access type and privileges, that all elements of the stack are @@ -7983,6 +8240,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg return err; } +enum { + PROCESS_SPIN_LOCK = (1 << 0), + PROCESS_RES_LOCK = (1 << 1), + PROCESS_LOCK_IRQ = (1 << 2), +}; + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. @@ -8005,30 +8268,33 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. */ -static int process_spin_lock(struct bpf_verifier_env *env, int regno, - bool is_lock) +static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) { + bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK; + const char *lock_str = is_res_lock ? 
"bpf_res_spin" : "bpf_spin"; struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); + bool is_irq = flags & PROCESS_LOCK_IRQ; u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; struct btf_record *rec; + u32 spin_lock_off; int err; if (!is_const) { verbose(env, - "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n", + regno, lock_str); return -EINVAL; } if (reg->type == PTR_TO_MAP_VALUE) { map = reg->map_ptr; if (!map->btf) { verbose(env, - "map '%s' has to have BTF in order to use bpf_spin_lock\n", - map->name); + "map '%s' has to have BTF in order to use %s_lock\n", + map->name, lock_str); return -EINVAL; } } else { @@ -8036,36 +8302,53 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, } rec = reg_btf_record(reg); - if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { - verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", - map ? map->name : "kptr"); + if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) { + verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local", + map ? map->name : "kptr", lock_str); return -EINVAL; } - if (rec->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", - val + reg->off, rec->spin_lock_off); + spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off; + if (spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", + val + reg->off, lock_str, spin_lock_off); return -EINVAL; } if (is_lock) { void *ptr; + int type; if (map) ptr = map; else ptr = btf; - if (cur->active_locks) { - verbose(env, - "Locking two bpf_spin_locks are not allowed\n"); - return -EINVAL; + if (!is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + } else if (is_res_lock && cur->active_locks) { + if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) { + verbose(env, "Acquiring the same lock again, AA deadlock detected\n"); + return -EINVAL; + } } - err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr); + + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr); if (err < 0) { verbose(env, "Failed to acquire lock state\n"); return err; } } else { void *ptr; + int type; if (map) ptr = map; @@ -8073,12 +8356,26 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, ptr = btf; if (!cur->active_locks) { - verbose(env, "bpf_spin_unlock without taking a lock\n"); + verbose(env, "%s_unlock without taking a lock\n", lock_str); return -EINVAL; } - if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) { - verbose(env, "bpf_spin_unlock of different lock\n"); + if (is_res_lock && is_irq) + type = REF_TYPE_RES_LOCK_IRQ; + else if (is_res_lock) + type = REF_TYPE_RES_LOCK; + else + type = REF_TYPE_LOCK; + if (!find_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); + return -EINVAL; + } + if (reg->id != 
cur->active_lock_id || ptr != cur->active_lock_ptr) { + verbose(env, "%s_unlock cannot be out of order\n", lock_str); + return -EINVAL; + } + if (release_lock_state(cur, type, reg->id, ptr)) { + verbose(env, "%s_unlock of different lock\n", lock_str); return -EINVAL; } @@ -8429,10 +8726,12 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env, { struct bpf_verifier_state_list *sl; struct bpf_verifier_state *st; + struct list_head *pos, *head; /* Explored states are pushed in stack order, most recent states come first */ - sl = *explored_state(env, insn_idx); - for (; sl; sl = sl->next) { + head = explored_state(env, insn_idx); + list_for_each(pos, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); /* If st->branches != 0 state is a part of current DFS verification path, * hence cur & st for a loop. */ @@ -9149,10 +9448,11 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return 0; } -/* Returns constant key value if possible, else negative error */ -static s64 get_constant_map_key(struct bpf_verifier_env *env, +/* Returns constant key value in `value` if possible, else negative error */ +static int get_constant_map_key(struct bpf_verifier_env *env, struct bpf_reg_state *key, - u32 key_size) + u32 key_size, + s64 *value) { struct bpf_func_state *state = func(env, key); struct bpf_reg_state *reg; @@ -9179,8 +9479,10 @@ static s64 get_constant_map_key(struct bpf_verifier_env *env, /* First handle precisely tracked STACK_ZERO */ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--) zero_size++; - if (zero_size >= key_size) + if (zero_size >= key_size) { + *value = 0; return 0; + } /* Check that stack contains a scalar spill of expected size */ if (!is_spilled_scalar_reg(&state->stack[spi])) @@ -9203,9 +9505,12 @@ static s64 get_constant_map_key(struct bpf_verifier_env *env, if (err < 0) return err; - return reg->var_off.value; + *value = reg->var_off.value; + return 0; } +static bool can_elide_value_nullness(enum bpf_map_type type); + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn, @@ -9354,9 +9659,16 @@ skip_type_check: err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); if (err) return err; - meta->const_map_key = get_constant_map_key(env, reg, key_size); - if (meta->const_map_key < 0 && meta->const_map_key != -EOPNOTSUPP) - return meta->const_map_key; + if (can_elide_value_nullness(meta->map_ptr->map_type)) { + err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); + if (err < 0) { + meta->const_map_key = -1; + if (err == -EOPNOTSUPP) + err = 0; + else + return err; + } + } break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) @@ -9389,11 +9701,11 @@ skip_type_check: return -EACCES; } if (meta->func_id == BPF_FUNC_spin_lock) { - err = process_spin_lock(env, regno, true); + err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK); if (err) return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - err = process_spin_lock(env, regno, false); + err = process_spin_lock(env, regno, 0); if (err) return err; } else { @@ -9651,7 +9963,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) { - verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n"); + verbose(env, "mixing of tail_calls and bpf-to-bpf calls 
is not supported\n"); return -EINVAL; } break; @@ -10222,23 +10534,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); - /* Only global subprogs cannot be called with a lock held. */ if (env->cur_state->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; } - /* Only global subprogs cannot be called with preemption disabled. */ - if (env->cur_state->active_preempt_locks) { - verbose(env, "global function calls are not allowed with preemption disabled,\n" - "use static function instead\n"); - return -EINVAL; - } - - if (env->cur_state->active_irq_id) { - verbose(env, "global function calls are not allowed with IRQs disabled,\n" - "use static function instead\n"); + if (env->subprog_info[subprog].might_sleep && + (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + env->cur_state->active_irq_id || !in_sleepable(env))) { + verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" + "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" + "a non-sleepable BPF program context\n"); return -EINVAL; } @@ -10737,6 +11044,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { struct bpf_verifier_state *state = env->cur_state; + enum bpf_prog_type type = resolve_prog_type(env->prog); + struct bpf_reg_state *reg = reg_state(env, BPF_REG_0); bool refs_lingering = false; int i; @@ -10746,6 +11055,12 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].type != REF_TYPE_PTR) continue; + /* Allow struct_ops programs to return a referenced kptr back to + * kernel. Type checks are performed later in check_return_code. + */ + if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit && + reg->ref_obj_id == state->refs[i].id) + continue; verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", state->refs[i].id, state->refs[i].insn_idx); refs_lingering = true; @@ -11272,7 +11587,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_flag) && - btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -11444,10 +11759,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* mark_btf_func_reg_size() is used when the reg size is determined by * the BTF func_proto's return value size and argument. 
*/ -static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, - size_t reg_size) +static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs, + u32 regno, size_t reg_size) { - struct bpf_reg_state *reg = &cur_regs(env)[regno]; + struct bpf_reg_state *reg = ®s[regno]; if (regno == BPF_REG_0) { /* Function return value */ @@ -11465,6 +11780,12 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } +static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, + size_t reg_size) +{ + return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size); +} + static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) { return meta->kfunc_flags & KF_ACQUIRE; @@ -11602,6 +11923,7 @@ enum { KF_ARG_RB_ROOT_ID, KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, + KF_ARG_RES_SPIN_LOCK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -11611,6 +11933,7 @@ BTF_ID(struct, bpf_list_node) BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) +BTF_ID(struct, bpf_res_spin_lock) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -11659,6 +11982,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); +} + static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, const struct btf_param *arg) { @@ -11730,6 +12058,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_MAP, KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, + KF_ARG_PTR_TO_RES_SPIN_LOCK, }; enum special_kfunc_type { @@ -11766,6 +12095,12 @@ enum special_kfunc_type { KF_bpf_iter_num_new, KF_bpf_iter_num_next, KF_bpf_iter_num_destroy, + KF_bpf_set_dentry_xattr, + KF_bpf_remove_dentry_xattr, + KF_bpf_res_spin_lock, + KF_bpf_res_spin_unlock, + KF_bpf_res_spin_lock_irqsave, + KF_bpf_res_spin_unlock_irqrestore, }; BTF_SET_START(special_kfunc_set) @@ -11795,6 +12130,10 @@ BTF_ID(func, bpf_wq_set_callback_impl) #ifdef CONFIG_CGROUPS BTF_ID(func, bpf_iter_css_task_new) #endif +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_set_dentry_xattr) +BTF_ID(func, bpf_remove_dentry_xattr) +#endif BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -11844,6 +12183,17 @@ BTF_ID(func, bpf_local_irq_restore) BTF_ID(func, bpf_iter_num_new) BTF_ID(func, bpf_iter_num_next) BTF_ID(func, bpf_iter_num_destroy) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_set_dentry_xattr) +BTF_ID(func, bpf_remove_dentry_xattr) +#else +BTF_ID_UNUSED +BTF_ID_UNUSED +#endif +BTF_ID(func, bpf_res_spin_lock) +BTF_ID(func, bpf_res_spin_unlock) +BTF_ID(func, bpf_res_spin_lock_irqsave) +BTF_ID(func, bpf_res_spin_unlock_irqrestore) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11937,6 +12287,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; + if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_RES_SPIN_LOCK; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -12044,13 +12397,19 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, struct bpf_kfunc_call_arg_meta 
*meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + int err, kfunc_class = IRQ_NATIVE_KFUNC; bool irq_save; - int err; - if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) { + if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) { irq_save = true; - } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) { + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + kfunc_class = IRQ_LOCK_KFUNC; + } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) { irq_save = false; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + kfunc_class = IRQ_LOCK_KFUNC; } else { verbose(env, "verifier internal error: unknown irq flags kfunc\n"); return -EFAULT; @@ -12066,7 +12425,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx); + err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class); if (err) return err; } else { @@ -12080,7 +12439,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (err) return err; - err = unmark_stack_slot_irq_flag(env, reg); + err = unmark_stack_slot_irq_flag(env, reg, kfunc_class); if (err) return err; } @@ -12207,7 +12566,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ if (!env->cur_state->active_locks) return -EINVAL; - s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr); + s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr); if (!s) { verbose(env, "held lock and object are not in the same allocation\n"); return -EINVAL; @@ -12243,9 +12602,18 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } +static bool is_bpf_res_spin_lock_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] || + btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]; +} + static bool kfunc_spin_allowed(u32 btf_id) { - return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id); + return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) || + is_bpf_res_spin_lock_kfunc(btf_id); } static bool is_sync_callback_calling_kfunc(u32 btf_id) @@ -12677,6 +13045,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: case KF_ARG_PTR_TO_IRQ_FLAG: + case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; default: WARN_ON_ONCE(1); @@ -12975,6 +13344,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_RES_SPIN_LOCK: + { + int flags = PROCESS_RES_LOCK; + + if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d doesn't point to map value or allocated object\n", i); + return -EINVAL; + } + + if (!is_bpf_res_spin_lock_kfunc(meta->func_id)) + return -EFAULT; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] || + meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) + flags |= PROCESS_SPIN_LOCK; + if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] || + 
meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) + flags |= PROCESS_LOCK_IRQ; + ret = process_spin_lock(env, regno, flags); + if (ret < 0) + return ret; + break; + } } } @@ -13060,6 +13451,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, insn_aux->is_iter_next = is_iter_next_kfunc(&meta); + if (!insn->off && + (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] || + insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) { + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false); + if (!branch) { + verbose(env, "failed to push state for failed lock acquisition\n"); + return -ENOMEM; + } + + regs = branch->frame[branch->curframe]->regs; + + /* Clear r0-r5 registers in forked state */ + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(env, regs, caller_saved[i]); + + mark_reg_unknown(env, regs, BPF_REG_0); + err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1); + if (err) { + verbose(env, "failed to mark s32 range for retval in forked state for lock\n"); + return err; + } + __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32)); + } + if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); return -EACCES; @@ -13230,6 +13648,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); + if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] || + meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) + __mark_reg_const_zero(env, ®s[BPF_REG_0]); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); @@ -16384,13 +16805,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char const char *exit_ctx = "At program exit"; struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; - struct bpf_reg_state *reg; + struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; bool return_32bit = false; + const struct btf_type *reg_type, *ret_type = NULL; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog || frame->in_exception_callback_fn) { @@ -16399,10 +16821,26 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char if (prog->expected_attach_type == BPF_LSM_CGROUP) /* See below, can be 0 or 0-1 depending on hook. */ break; - fallthrough; + if (!prog->aux->attach_func_proto->type) + return 0; + break; case BPF_PROG_TYPE_STRUCT_OPS: if (!prog->aux->attach_func_proto->type) return 0; + + if (frame->in_exception_callback_fn) + break; + + /* Allow a struct_ops program to return a referenced kptr if it + * matches the operator's return type and is in its unmodified + * form. A scalar zero (i.e., a null pointer) is also allowed. + */ + reg_type = reg->btf ? 
btf_type_by_id(reg->btf, reg->btf_id) : NULL; + ret_type = btf_type_resolve_ptr(prog->aux->attach_btf, + prog->aux->attach_func_proto->type, + NULL); + if (ret_type && ret_type == reg_type && reg->ref_obj_id) + return __check_ptr_off_reg(env, reg, regno, false); break; default: break; @@ -16424,8 +16862,6 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return -EACCES; } - reg = cur_regs(env) + regno; - if (frame->in_async_callback_fn) { /* enforce return zero from async callbacks like timer */ exit_ctx = "At async callback return"; @@ -16524,6 +16960,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char case BPF_PROG_TYPE_NETFILTER: range = retval_range(NF_DROP, NF_ACCEPT); break; + case BPF_PROG_TYPE_STRUCT_OPS: + if (!ret_type) + return 0; + range = retval_range(0, 0); + break; case BPF_PROG_TYPE_EXT: /* freplace program can return anything as its return value * depends on the to-be-replaced kernel func or bpf program. @@ -16567,6 +17008,14 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) subprog->changes_pkt_data = true; } +static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->might_sleep = true; +} + /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. @@ -16580,6 +17029,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) caller = find_containing_subprog(env, t); callee = find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; + caller->might_sleep |= callee->might_sleep; } /* non-recursive DFS pseudo code @@ -16738,27 +17188,6 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, /* Bitmask with 1s for all caller saved registers */ #define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) -/* Return a bitmask specifying which caller saved registers are - * clobbered by a call to a helper *as if* this helper follows - * bpf_fastcall contract: - * - includes R0 if function is non-void; - * - includes R1-R5 if corresponding parameter has is described - * in the function prototype. - */ -static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) -{ - u32 mask; - int i; - - mask = 0; - if (fn->ret_type != RET_VOID) - mask |= BIT(BPF_REG_0); - for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) - if (fn->arg_type[i] != ARG_DONTCARE) - mask |= BIT(BPF_REG_1 + i); - return mask; -} - /* True if do_misc_fixups() replaces calls to helper number 'imm', * replacement patch is presumed to follow bpf_fastcall contract * (see mark_fastcall_pattern_for_call() below). @@ -16775,24 +17204,54 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */ -static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) +struct call_summary { + u8 num_params; + bool is_void; + bool fastcall; +}; + +/* If @call is a kfunc or helper call, fills @cs and returns true, + * otherwise returns false. 
+ */ +static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, + struct call_summary *cs) { - u32 vlen, i, mask; + struct bpf_kfunc_call_arg_meta meta; + const struct bpf_func_proto *fn; + int i; - vlen = btf_type_vlen(meta->func_proto); - mask = 0; - if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type))) - mask |= BIT(BPF_REG_0); - for (i = 0; i < vlen; ++i) - mask |= BIT(BPF_REG_1 + i); - return mask; -} + if (bpf_helper_call(call)) { -/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ -static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_FASTCALL; + if (get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return false; + cs->fastcall = fn->allow_fastcall && + (verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + cs->is_void = fn->ret_type == RET_VOID; + cs->num_params = 0; + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) { + if (fn->arg_type[i] == ARG_DONTCARE) + break; + cs->num_params++; + } + return true; + } + + if (bpf_pseudo_kfunc_call(call)) { + int err; + + err = fetch_kfunc_meta(env, call, &meta, NULL); + if (err < 0) + /* error would be reported later */ + return false; + cs->num_params = btf_type_vlen(meta.func_proto); + cs->fastcall = meta.kfunc_flags & KF_FASTCALL; + cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type)); + return true; + } + + return false; } /* LLVM define a bpf_fastcall function attribute. @@ -16875,39 +17334,23 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; - const struct bpf_func_proto *fn; - u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 clobbered_regs_mask; + struct call_summary cs; u32 expected_regs_mask; - bool can_be_inlined = false; s16 off; int i; - if (bpf_helper_call(call)) { - if (get_helper_proto(env, call->imm, &fn) < 0) - /* error would be reported later */ - return; - clobbered_regs_mask = helper_fastcall_clobber_mask(fn); - can_be_inlined = fn->allow_fastcall && - (verifier_inlines_helper_call(env, call->imm) || - bpf_jit_inlines_helper_call(call->imm)); - } - - if (bpf_pseudo_kfunc_call(call)) { - struct bpf_kfunc_call_arg_meta meta; - int err; - - err = fetch_kfunc_meta(env, call, &meta, NULL); - if (err < 0) - /* error would be reported later */ - return; - - clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta); - can_be_inlined = is_fastcall_kfunc_call(&meta); - } - - if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + if (!get_call_summary(env, call, &cs)) return; + /* A bitmask specifying which caller saved registers are clobbered + * by a call to a helper/kfunc *as if* this helper/kfunc follows + * bpf_fastcall contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter is described + * in the function prototype. + */ + clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0); /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; @@ -16965,7 +17408,7 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. 
*/ - if (can_be_inlined) + if (cs.fastcall) env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else subprog->keep_fastcall_stack = 1; @@ -17047,9 +17490,20 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } - if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) - mark_subprog_changes_pkt_data(env, t); - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + if (bpf_helper_call(insn)) { + const struct bpf_func_proto *fp; + + ret = get_helper_proto(env, insn->imm, &fp); + /* If called in a non-sleepable context program will be + * rejected anyway, so we should end up with precise + * sleepable marks on subprogs, except for dead code + * elimination. + */ + if (ret == 0 && fp->might_sleep) + mark_subprog_might_sleep(env, t); + if (bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; ret = fetch_kfunc_meta(env, insn, &meta, NULL); @@ -17068,6 +17522,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env) */ mark_force_checkpoint(env, t); } + /* Same as helpers, if called in a non-sleepable context + * program will be rejected anyway, so we should end up + * with precise sleepable marks on subprogs, except for + * dead code elimination. + */ + if (ret == 0 && is_kfunc_sleepable(&meta)) + mark_subprog_might_sleep(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); @@ -17110,9 +17571,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state; + int *insn_stack, *insn_state, *insn_postorder; int ex_insn_beg, i, ret = 0; - bool ex_done = false; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -17124,6 +17584,17 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } + insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_postorder) { + kvfree(insn_state); + kvfree(insn_stack); + return -ENOMEM; + } + + ex_insn_beg = env->exception_callback_subprog + ? 
env->subprog_info[env->exception_callback_subprog].start + : 0; + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; @@ -17137,6 +17608,7 @@ walk_cfg: case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; + insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17155,13 +17627,10 @@ walk_cfg: goto err_free; } - if (env->exception_callback_subprog && !ex_done) { - ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; - + if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) { insn_state[ex_insn_beg] = DISCOVERED; insn_stack[0] = ex_insn_beg; env->cfg.cur_stack = 1; - ex_done = true; goto walk_cfg; } @@ -17184,6 +17653,7 @@ walk_cfg: } ret = 0; /* cfg looks good */ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; + env->prog->aux->might_sleep = env->subprog_info[0].might_sleep; err_free: kvfree(insn_state); @@ -17800,18 +18270,22 @@ static void clean_verifier_state(struct bpf_verifier_env *env, static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { + struct bpf_verifier_state *loop_entry; struct bpf_verifier_state_list *sl; + struct list_head *pos, *head; - sl = *explored_state(env, insn); - while (sl) { + head = explored_state(env, insn); + list_for_each(pos, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); if (sl->state.branches) - goto next; + continue; + loop_entry = get_loop_entry(env, &sl->state); + if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches) + continue; if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) - goto next; + continue; clean_verifier_state(env, &sl->state); -next: - sl = sl->next; } } @@ -18112,7 +18586,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, case STACK_IRQ_FLAG: old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; - if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) || + old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class) return false; break; case STACK_MISC: @@ -18147,6 +18622,10 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) return false; + if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) || + old->active_lock_ptr != cur->active_lock_ptr) + return false; + for (i = 0; i < old->acquired_refs; i++) { if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || old->refs[i].type != cur->refs[i].type) @@ -18156,6 +18635,8 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: + case REF_TYPE_RES_LOCK: + case REF_TYPE_RES_LOCK_IRQ: if (old->refs[i].ptr != cur->refs[i].ptr) return false; break; @@ -18195,15 +18676,17 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, enum exact_level exact) + struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact) { - int i; + u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before; + u16 i; if (old->callback_depth > cur->callback_depth) return false; for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(env, 
&old->regs[i], &cur->regs[i], + if (((1 << i) & live_regs) && + !regsafe(env, &old->regs[i], &cur->regs[i], &env->idmap_scratch, exact)) return false; @@ -18224,6 +18707,7 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, enum exact_level exact) { + u32 insn_idx; int i; if (old->curframe != cur->curframe) @@ -18247,9 +18731,12 @@ static bool states_equal(struct bpf_verifier_env *env, * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { + insn_idx = i == old->curframe + ? env->insn_idx + : old->frame[i + 1]->callsite; if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) + if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) return false; } return true; @@ -18502,10 +18989,11 @@ static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl, **pprev; + struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; bool force_new_state, add_new_state, force_exact; + struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ @@ -18524,15 +19012,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; - pprev = explored_state(env, insn_idx); - sl = *pprev; - clean_live_states(env, insn_idx, cur); - while (sl) { + head = explored_state(env, insn_idx); + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); states_cnt++; if (sl->state.insn_idx != insn_idx) - goto next; + continue; if (sl->state.branches) { struct bpf_func_state *frame = sl->state.frame[sl->state.curframe]; @@ -18606,7 +19093,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - update_loop_entry(cur, &sl->state); + update_loop_entry(env, cur, &sl->state); goto hit; } } @@ -18615,7 +19102,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (is_may_goto_insn_at(env, insn_idx)) { if (sl->state.may_goto_depth != cur->may_goto_depth && states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - update_loop_entry(cur, &sl->state); + update_loop_entry(env, cur, &sl->state); goto hit; } } @@ -18682,11 +19169,13 @@ skip_inf_loop_check: * * Additional details are in the comment before get_loop_entry(). */ - loop_entry = get_loop_entry(&sl->state); + loop_entry = get_loop_entry(env, &sl->state); + if (IS_ERR(loop_entry)) + return PTR_ERR(loop_entry); force_exact = loop_entry && loop_entry->branches > 0; if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { if (force_exact) - update_loop_entry(cur, loop_entry); + update_loop_entry(env, cur, loop_entry); hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -18735,31 +19224,13 @@ miss: /* the state is unlikely to be useful. 
Remove it to * speed up verification */ - *pprev = sl->next; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE && - !sl->state.used_as_loop_entry) { - u32 br = sl->state.branches; - - WARN_ONCE(br, - "BUG live_done but branches_to_explore %d\n", - br); - free_verifier_state(&sl->state, false); - kfree(sl); - env->peak_states--; - } else { - /* cannot free this state, since parentage chain may - * walk it later. Add it for free_list instead to - * be freed at the end of verification - */ - sl->next = env->free_list; - env->free_list = sl; - } - sl = *pprev; - continue; + sl->in_free_list = true; + list_del(&sl->node); + list_add(&sl->node, &env->free_list); + env->free_list_size++; + env->explored_states_size--; + maybe_free_verifier_state(env, sl); } -next: - pprev = &sl->next; - sl = *pprev; } if (env->max_states_per_insn < states_cnt) @@ -18784,7 +19255,8 @@ next: if (!new_sl) return -ENOMEM; env->total_states++; - env->peak_states++; + env->explored_states_size++; + update_peak_states(env); env->prev_jmps_processed = env->jmps_processed; env->prev_insn_processed = env->insn_processed; @@ -18808,8 +19280,8 @@ next: cur->first_insn_idx = insn_idx; cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; - new_sl->next = *explored_state(env, insn_idx); - *explored_state(env, insn_idx) = new_sl; + list_add(&new_sl->node, head); + /* connect new state to parentage chain. Current frame needs all * registers connected. Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just @@ -18996,19 +19468,13 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level & BPF_LOG_LEVEL) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; - if (verifier_state_scratched(env)) print_insn_state(env, state, state->curframe); verbose_linfo(env, env->insn_idx, "; "); env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; env->prev_log_pos = env->log.end_pos; } @@ -19030,37 +19496,18 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; - - /* check for reserved fields is already done */ - - /* check src operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; + bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; - err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); - if (err) - return err; - - src_reg_type = regs[insn->src_reg].type; - - /* check that memory (src_reg + off) is readable, - * the state of dst_reg will be updated by this func + /* Check for reserved fields is already done in + * resolve_pseudo_ldimm64(). 
*/ - err = check_mem_access(env, env->insn_idx, insn->src_reg, - insn->off, BPF_SIZE(insn->code), - BPF_READ, insn->dst_reg, false, - BPF_MODE(insn->code) == BPF_MEMSX); - err = err ?: save_aux_ptr_type(env, src_reg_type, true); - err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], "ldx"); + err = check_load_mem(env, insn, false, is_ldsx, true, + "ldx"); if (err) return err; } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, env->insn_idx, insn); + err = check_atomic(env, insn); if (err) return err; env->insn_idx++; @@ -19072,25 +19519,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - /* check src1 operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - /* check src2 operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, insn->src_reg, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); + err = check_store_reg(env, insn, false); if (err) return err; } else if (class == BPF_ST) { @@ -19230,6 +19659,10 @@ process_bpf_exit: return err; break; } else { + if (WARN_ON_ONCE(env->cur_state->loop_entry)) { + verbose(env, "verifier bug: env->cur_state->loop_entry != NULL\n"); + return -EFAULT; + } do_print_state = true; continue; } @@ -19489,7 +19922,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -20319,7 +20752,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) { struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; + int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; struct bpf_insn *epilogue_buf = env->epilogue_buf; struct bpf_insn *insn_buf = env->insn_buf; @@ -20348,6 +20781,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -ENOMEM; env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1); + if (ret < 0) + return ret; } } @@ -20368,6 +20805,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) env->prog = new_prog; delta += cnt - 1; + + ret = add_kfunc_in_insns(env, insn_buf, cnt - 1); + if (ret < 0) + return ret; } } @@ -20400,7 +20841,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); @@ -20708,6 +21151,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) 
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; + func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -20924,6 +21368,14 @@ static void specialize_kfunc(struct bpf_verifier_env *env, */ env->seen_direct_write = seen_direct_write; } + + if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] && + bpf_lsm_has_d_inode_locked(prog)) + *addr = (unsigned long)bpf_set_dentry_xattr_locked; + + if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] && + bpf_lsm_has_d_inode_locked(prog)) + *addr = (unsigned long)bpf_remove_dentry_xattr_locked; } static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux, @@ -21358,7 +21810,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } - if (is_may_goto_insn(insn)) { + if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { + int stack_off_cnt = -stack_depth - 16; + + /* + * Two 8 byte slots, depth-16 stores the count, and + * depth-8 stores the start timestamp of the loop. + * + * The starting value of count is BPF_MAX_TIMED_LOOPS + * (0xffff). Every iteration loads it and subs it by 1, + * until the value becomes 0 in AX (thus, 1 in stack), + * after which we call arch_bpf_timed_may_goto, which + * either sets AX to 0xffff to keep looping, or to 0 + * upon timeout. AX is then stored into the stack. In + * the next iteration, we either see 0 and break out, or + * continue iterating until the next time value is 0 + * after subtraction, rinse and repeat. + */ + stack_depth_extra = 16; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); + /* + * AX is used as an argument to pass in stack_off_cnt + * (to add to r10/fp), and also as the return value of + * the call to arch_bpf_timed_may_goto. 
+ */ + insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); + insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); + insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); + cnt = 7; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } else if (is_may_goto_insn(insn)) { int stack_off = -stack_depth - 8; stack_depth_extra = 8; @@ -21687,12 +22182,12 @@ patch_map_ops_generic: if (insn->imm == BPF_FUNC_get_smp_processor_id && verifier_inlines_helper_call(env, insn->imm)) { /* BPF_FUNC_get_smp_processor_id inlining is an - * optimization, so if pcpu_hot.cpu_number is ever + * optimization, so if cpu_number is ever * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ #ifdef CONFIG_SMP - insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; @@ -21882,6 +22377,13 @@ next_insn: if (subprogs[cur_subprog + 1].start == i + delta + 1) { subprogs[cur_subprog].stack_depth += stack_depth_extra; subprogs[cur_subprog].stack_extra = stack_depth_extra; + + stack_depth = subprogs[cur_subprog].stack_depth; + if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) { + verbose(env, "stack size %d(extra %d) is too large\n", + stack_depth, stack_depth_extra); + return -EINVAL; + } cur_subprog++; stack_depth = subprogs[cur_subprog].stack_depth; stack_depth_extra = 0; @@ -21892,23 +22394,33 @@ next_insn: env->prog->aux->stack_depth = subprogs[0].stack_depth; for (i = 0; i < env->subprog_cnt; i++) { + int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; int subprog_start = subprogs[i].start; int stack_slots = subprogs[i].stack_extra / 8; + int slots = delta, cnt = 0; if (!stack_slots) continue; - if (stack_slots > 1) { + /* We need two slots in case timed may_goto is supported. */ + if (stack_slots > slots) { verbose(env, "verifier bug: stack_slots supports may_goto only\n"); return -EFAULT; } - /* Add ST insn to subprog prologue to init extra stack */ - insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, - -subprogs[i].stack_depth, BPF_MAX_LOOPS); + stack_depth = subprogs[i].stack_depth; + if (bpf_jit_supports_timed_may_goto()) { + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_TIMED_LOOPS); + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); + } else { + /* Add ST insn to subprog prologue to init extra stack */ + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_LOOPS); + } /* Copy first actual insn to preserve it */ - insn_buf[1] = env->prog->insnsi[subprog_start]; + insn_buf[cnt++] = env->prog->insnsi[subprog_start]; - new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; @@ -21918,7 +22430,7 @@ next_insn: * to insn after BPF_ST that inits may_goto count. * Adjustment will succeed because bpf_patch_insn_data() didn't fail. */ - WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); + WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); } /* Since poke tab is now finalized, publish aux to tracker. 
*/ @@ -22116,31 +22628,29 @@ static int remove_fastcall_spills_fills(struct bpf_verifier_env *env) static void free_states(struct bpf_verifier_env *env) { - struct bpf_verifier_state_list *sl, *sln; + struct bpf_verifier_state_list *sl; + struct list_head *head, *pos, *tmp; int i; - sl = env->free_list; - while (sl) { - sln = sl->next; + list_for_each_safe(pos, tmp, &env->free_list) { + sl = container_of(pos, struct bpf_verifier_state_list, node); free_verifier_state(&sl->state, false); kfree(sl); - sl = sln; } - env->free_list = NULL; + INIT_LIST_HEAD(&env->free_list); if (!env->explored_states) return; for (i = 0; i < state_htab_size(env); i++) { - sl = env->explored_states[i]; + head = &env->explored_states[i]; - while (sl) { - sln = sl->next; + list_for_each_safe(pos, tmp, head) { + sl = container_of(pos, struct bpf_verifier_state_list, node); free_verifier_state(&sl->state, false); kfree(sl); - sl = sln; } - env->explored_states[i] = NULL; + INIT_LIST_HEAD(&env->explored_states[i]); } } @@ -22148,6 +22658,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_subprog_info *sub = subprog_info(env, subprog); + struct bpf_prog_aux *aux = env->prog->aux; struct bpf_verifier_state *state; struct bpf_reg_state *regs; int ret, i; @@ -22255,6 +22766,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, BPF_REG_1); } + /* Acquire references for struct_ops program arguments tagged with "__ref" */ + if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) { + for (i = 0; i < aux->ctx_arg_info_size; i++) + aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ? + acquire_reference(env, 0) : 0; + } + ret = do_check(env); out: /* check for NULL is necessary, since cur_state can be freed inside @@ -22377,6 +22895,15 @@ static void print_verification_stats(struct bpf_verifier_env *env) env->peak_states, env->longest_mark_read_walk); } +int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, + const struct bpf_ctx_arg_aux *info, u32 cnt) +{ + prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL); + prog->aux->ctx_arg_info_size = cnt; + + return prog->aux->ctx_arg_info ? 
0 : -ENOMEM; +} + static int check_struct_ops_btf_id(struct bpf_verifier_env *env) { const struct btf_type *t, *func_proto; @@ -22384,10 +22911,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) const struct bpf_struct_ops *st_ops; const struct btf_member *member; struct bpf_prog *prog = env->prog; - u32 btf_id, member_idx; + bool has_refcounted_arg = false; + u32 btf_id, member_idx, member_off; struct btf *btf; const char *mname; - int err; + int i, err; if (!prog->gpl_compatible) { verbose(env, "struct ops programs must have a GPL compatible license\n"); @@ -22435,7 +22963,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EINVAL; } - err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8); + member_off = __btf_member_bit_offset(t, member) / 8; + err = bpf_struct_ops_supported(st_ops, member_off); if (err) { verbose(env, "attach to unsupported member %s of struct %s\n", mname, st_ops->name); @@ -22457,17 +22986,32 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) return -EACCES; } - /* btf_ctx_access() used this to provide argument type info */ - prog->aux->ctx_arg_info = - st_ops_desc->arg_info[member_idx].info; - prog->aux->ctx_arg_info_size = - st_ops_desc->arg_info[member_idx].cnt; + for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) { + if (st_ops_desc->arg_info[member_idx].info->refcounted) { + has_refcounted_arg = true; + break; + } + } + + /* Tail call is not allowed for programs with refcounted arguments since we + * cannot guarantee that valid refcounted kptrs will be passed to the callee. + */ + for (i = 0; i < env->subprog_cnt; i++) { + if (has_refcounted_arg && env->subprog_info[i].has_tail_call) { + verbose(env, "program with __ref argument cannot tail call\n"); + return -EINVAL; + } + } + + prog->aux->st_ops = st_ops; + prog->aux->attach_st_ops_member_off = member_off; prog->aux->attach_func_proto = func_proto; prog->aux->attach_func_name = mname; env->ops = st_ops->verifier_ops; - return 0; + return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info, + st_ops_desc->arg_info[member_idx].cnt); } #define SECURITY_PREFIX "security_" @@ -22543,6 +23087,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; bool tgt_changes_pkt_data; + bool tgt_might_sleep; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -22585,6 +23130,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension program changes packet data, while original does not\n"); return -EINVAL; } + + tgt_might_sleep = aux->func + ? aux->func[subprog]->aux->might_sleep + : aux->might_sleep; + if (prog->aux->might_sleep && !tgt_might_sleep) { + bpf_log(log, + "Extension program may sleep, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -22841,6 +23395,33 @@ BTF_ID(func, __rcu_read_unlock) #endif BTF_SET_END(btf_id_deny) +/* fexit and fmod_ret can't be used to attach to __noreturn functions. + * Currently, we must manually list all __noreturn functions here. Once a more + * robust solution is implemented, this workaround can be removed. 
+ */ +BTF_SET_START(noreturn_deny) +#ifdef CONFIG_IA32_EMULATION +BTF_ID(func, __ia32_sys_exit) +BTF_ID(func, __ia32_sys_exit_group) +#endif +#ifdef CONFIG_KUNIT +BTF_ID(func, __kunit_abort) +BTF_ID(func, kunit_try_catch_throw) +#endif +#ifdef CONFIG_MODULES +BTF_ID(func, __module_put_and_kthread_exit) +#endif +#ifdef CONFIG_X86_64 +BTF_ID(func, __x64_sys_exit) +BTF_ID(func, __x64_sys_exit_group) +#endif +BTF_ID(func, do_exit) +BTF_ID(func, do_group_exit) +BTF_ID(func, kthread_complete_and_exit) +BTF_ID(func, kthread_exit) +BTF_ID(func, make_task_dead) +BTF_SET_END(noreturn_deny) + static bool can_be_sleepable(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_TRACING) { @@ -22917,9 +23498,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_btf_trace = true; return 0; } else if (prog->expected_attach_type == BPF_TRACE_ITER) { - if (!bpf_iter_prog_supported(prog)) - return -EINVAL; - return 0; + return bpf_iter_prog_supported(prog); } if (prog->type == BPF_PROG_TYPE_LSM) { @@ -22929,6 +23508,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } else if (prog->type == BPF_PROG_TYPE_TRACING && btf_id_set_contains(&btf_id_deny, btf_id)) { return -EINVAL; + } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT || + prog->expected_attach_type == BPF_MODIFY_RETURN) && + btf_id_set_contains(&noreturn_deny, btf_id)) { + verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n"); + return -EINVAL; } key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); @@ -23021,6 +23605,302 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } +static bool can_fallthrough(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return true; + + if (opcode == BPF_EXIT || opcode == BPF_JA) + return false; + + return true; +} + +static bool can_jump(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return false; + + switch (opcode) { + case BPF_JA: + case BPF_JEQ: + case BPF_JNE: + case BPF_JLT: + case BPF_JLE: + case BPF_JGT: + case BPF_JGE: + case BPF_JSGT: + case BPF_JSGE: + case BPF_JSLT: + case BPF_JSLE: + case BPF_JCOND: + return true; + } + + return false; +} + +static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + struct bpf_insn *insn = &prog->insnsi[idx]; + int i = 0, insn_sz; + u32 dst; + + insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; + if (can_fallthrough(insn) && idx + 1 < prog->len) + succ[i++] = idx + insn_sz; + + if (can_jump(insn)) { + dst = idx + jmp_offset(insn) + 1; + if (i == 0 || succ[0] != dst) + succ[i++] = dst; + } + + return i; +} + +/* Each field is a register bitmask */ +struct insn_live_regs { + u16 use; /* registers read by instruction */ + u16 def; /* registers written by instruction */ + u16 in; /* registers that may be alive before instruction */ + u16 out; /* registers that may be alive after instruction */ +}; + +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Compute info->{use,def} fields for the instruction */ +static void compute_insn_live_regs(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct insn_live_regs *info) +{ + struct call_summary cs; + u8 class = BPF_CLASS(insn->code); + u8 code = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u16 src = BIT(insn->src_reg); + u16 dst = BIT(insn->dst_reg); + u16 r0 = BIT(0); + u16 def = 0; + u16 use = 0xffff; + + switch (class) { + case BPF_LD: + switch (mode) { + case BPF_IMM: + if (BPF_SIZE(insn->code) == BPF_DW) { + def = dst; + use = 0; + } + break; + case BPF_LD | BPF_ABS: + case BPF_LD | BPF_IND: + /* stick with defaults */ + break; + } + break; + case BPF_LDX: + switch (mode) { + case BPF_MEM: + case BPF_MEMSX: + def = dst; + use = src; + break; + } + break; + case BPF_ST: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst; + break; + } + break; + case BPF_STX: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst | src; + break; + case BPF_ATOMIC: + switch (insn->imm) { + case BPF_CMPXCHG: + use = r0 | dst | src; + def = r0; + break; + case BPF_LOAD_ACQ: + def = dst; + use = src; + break; + case BPF_STORE_REL: + def = 0; + use = dst | src; + break; + default: + use = dst | src; + if (insn->imm & BPF_FETCH) + def = src; + else + def = 0; + } + break; + } + break; + case BPF_ALU: + case BPF_ALU64: + switch (code) { + case BPF_END: + use = dst; + def = dst; + break; + case BPF_MOV: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = 0; + else + use = src; + break; + default: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + case BPF_JMP: + case BPF_JMP32: + switch (code) { + case BPF_JA: + case BPF_JCOND: + def = 0; + use = 0; + break; + case BPF_EXIT: + def = 0; + use = r0; + break; + case BPF_CALL: + def = ALL_CALLER_SAVED_REGS; + use = def & ~BIT(BPF_REG_0); + if (get_call_summary(env, insn, &cs)) + use = GENMASK(cs.num_params, 1); + break; + default: + def = 0; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + } + + info->def = def; + info->use = use; +} + +/* Compute may-live registers after each instruction in the program. + * The register is live after the instruction I if it is read by some + * instruction S following I during program execution and is not + * overwritten between I and S. + * + * Store result in env->insn_aux_data[i].live_regs. 
+ */ +static int compute_live_registers(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + struct insn_live_regs *state; + int insn_cnt = env->prog->len; + int err = 0, i, j; + bool changed; + + /* Use the following algorithm: + * - define the following: + * - I.use : a set of all registers read by instruction I; + * - I.def : a set of all registers written by instruction I; + * - I.in : a set of all registers that may be alive before I execution; + * - I.out : a set of all registers that may be alive after I execution; + * - insn_successors(I): a set of instructions S that might immediately + * follow I for some program execution; + * - associate separate empty sets 'I.in' and 'I.out' with each instruction; + * - visit each instruction in a postorder and update + * state[i].in, state[i].out as follows: + * + * state[i].out = U [state[s].in for S in insn_successors(i)] + * state[i].in = (state[i].out / state[i].def) U state[i].use + * + * (where U stands for set union, / stands for set difference) + * - repeat the computation while {in,out} fields changes for + * any instruction. + */ + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + if (!state) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < insn_cnt; ++i) + compute_insn_live_regs(env, &insns[i], &state[i]); + + changed = true; + while (changed) { + changed = false; + for (i = 0; i < env->cfg.cur_postorder; ++i) { + int insn_idx = env->cfg.insn_postorder[i]; + struct insn_live_regs *live = &state[insn_idx]; + int succ_num; + u32 succ[2]; + u16 new_out = 0; + u16 new_in = 0; + + succ_num = insn_successors(env->prog, insn_idx, succ); + for (int s = 0; s < succ_num; ++s) + new_out |= state[succ[s]].in; + new_in = (new_out & ~live->def) | live->use; + if (new_out != live->out || new_in != live->in) { + live->in = new_in; + live->out = new_out; + changed = true; + } + } + } + + for (i = 0; i < insn_cnt; ++i) + insn_aux[i].live_regs_before = state[i].in; + + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "Live regs before insn:\n"); + for (i = 0; i < insn_cnt; ++i) { + verbose(env, "%3d: ", i); + for (j = BPF_REG_0; j < BPF_REG_10; ++j) + if (insn_aux[i].live_regs_before & BIT(j)) + verbose(env, "%d", j); + else + verbose(env, "."); + verbose(env, " "); + verbose_insn(env, &insns[i]); + if (bpf_is_ldimm64(&insns[i])) + i++; + } + } + +out: + kvfree(state); + kvfree(env->cfg.insn_postorder); + env->cfg.insn_postorder = NULL; + env->cfg.cur_postorder = 0; + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -23098,12 +23978,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS; env->explored_states = kvcalloc(state_htab_size(env), - sizeof(struct bpf_verifier_state_list *), + sizeof(struct list_head), GFP_USER); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; + for (i = 0; i < state_htab_size(env); i++) + INIT_LIST_HEAD(&env->explored_states[i]); + INIT_LIST_HEAD(&env->free_list); + ret = check_btf_info_early(env, attr, uattr); if (ret < 0) goto skip_full_check; @@ -23138,6 +24022,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; + ret = compute_live_registers(env); + if (ret < 0) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) 
goto skip_full_check; @@ -23276,6 +24164,7 @@ err_unlock: vfree(env->insn_aux_data); kvfree(env->insn_hist); err_free_env: + kvfree(env->cfg.insn_postorder); kvfree(env); return ret; } diff --git a/kernel/capability.c b/kernel/capability.c index e089d2628c29..829f49ae07b9 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -286,22 +286,6 @@ bool has_ns_capability(struct task_struct *t, } /** - * has_capability - Does a task have a capability in init_user_ns - * @t: The task in question - * @cap: The capability to be tested for - * - * Return true if the specified task has the given superior capability - * currently in effect to the initial user namespace, false if not. - * - * Note that this does not set PF_SUPERPRIV on the task. - */ -bool has_capability(struct task_struct *t, int cap) -{ - return has_ns_capability(t, &init_user_ns, cap); -} -EXPORT_SYMBOL(has_capability); - -/** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question diff --git a/kernel/cfi.c b/kernel/cfi.c index 08caad776717..422fa4f958ae 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -7,6 +7,8 @@ #include <linux/cfi.h> +bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE); + enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type) { @@ -17,7 +19,7 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, pr_err("CFI failure at %pS (no target information)\n", (void *)addr); - if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) { + if (cfi_warn) { __warn(NULL, 0, (void *)addr, 0, regs, NULL); return BUG_TRAP_TYPE_WARN; } @@ -71,14 +73,11 @@ static bool is_module_cfi_trap(unsigned long addr) struct module *mod; bool found = false; - rcu_read_lock_sched_notrace(); - + guard(rcu)(); mod = __module_address(addr); if (mod) found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end); - rcu_read_unlock_sched_notrace(); - return found; } #else /* CONFIG_MODULES */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c964dd7ff967..95ab39e1ec8f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -168,6 +168,7 @@ struct cgroup_mgctx { extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; +extern bool cgrp_dfl_visible; /* iterate across the hierarchies */ #define for_each_root(root) \ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e28d5f0d20ed..11ea8d24ac72 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -673,6 +673,7 @@ struct cftype cgroup1_base_files[] = { int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; + bool cgrp_v1_visible = false; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); @@ -684,12 +685,18 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) { if (cgroup1_subsys_absent(ss)) continue; + cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); } + if (cgrp_dfl_visible && !cgrp_v1_visible) + pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n"); + + return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9061bd55436..f231fe3a0744 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -171,7 +171,7 @@ 
EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ -static bool cgrp_dfl_visible; +bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; @@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - set_bit(CGRP_KILL, &cgrp->flags); + cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); @@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp) send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); - - spin_lock_irq(&css_set_lock); - clear_bit(CGRP_KILL, &cgrp->flags); - spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) @@ -4451,7 +4447,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; @@ -5835,7 +5831,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) } /* - * This extra ref will be put in cgroup_free_fn() and guarantees + * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); @@ -6488,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); + if (kargs->cgrp) + kargs->kill_seq = kargs->cgrp->kill_seq; + else + kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { @@ -6668,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; @@ -6681,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { - if (kargs->cgrp) + if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; - else + cgrp_kill_seq = kargs->cgrp->kill_seq; + } else { cgrp_flags = cset->dfl_cgrp->flags; + cgrp_kill_seq = cset->dfl_cgrp->kill_seq; + } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; @@ -6719,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child, * child down right after we finished preparing it for * userspace. 
*/ - kill = test_bit(CGRP_KILL, &cgrp_flags); + kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 25c1d7b77e2f..b69a7db67090 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include "cgroup-internal.h" #include "cpuset-internal.h" /* @@ -175,6 +176,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = update_relax_domain_level(cs, val); break; default: @@ -373,6 +375,46 @@ out: return ret; } +#ifdef CONFIG_PROC_PID_CPUSET +/* + * proc_cpuset_show() + * - Print tasks cpuset path into seq_file. + * - Used for /proc/<pid>/cpuset. + */ +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + char *buf; + struct cgroup_subsys_state *css; + int retval; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + + if (retval == -E2BIG) + retval = -ENAMETOOLONG; + if (retval < 0) + goto out_free; + seq_puts(m, buf); + seq_putc(m, '\n'); + retval = 0; +out_free: + kfree(buf); +out: + return retval; +} +#endif /* CONFIG_PROC_PID_CPUSET */ + static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct cpuset *cs = css_cs(css); @@ -424,24 +466,31 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_MEM_HARDWALL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: + pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name); retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: + pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name); cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); break; case FILE_SPREAD_SLAB: + pr_warn_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); break; default: diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f910c828973..39c1fc643d77 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,7 +21,6 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. 
*/ -#include "cgroup-internal.h" #include "cpuset-internal.h" #include <linux/init.h> @@ -954,10 +953,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs) css_task_iter_end(&it); } -static void dl_rebuild_rd_accounting(void) +void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; + int cpu; + u64 cookie = ++dl_cookie; lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); @@ -965,11 +966,12 @@ static void dl_rebuild_rd_accounting(void) rcu_read_lock(); - /* - * Clear default root domain DL accounting, it will be computed again - * if a task belongs to it. - */ - dl_clear_root_domain(&def_root_domain); + for_each_possible_cpu(cpu) { + if (dl_bw_visited(cpu, cookie)) + continue; + + dl_clear_root_domain_cpu(cpu); + } cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { @@ -990,16 +992,6 @@ static void dl_rebuild_rd_accounting(void) rcu_read_unlock(); } -static void -partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - mutex_lock(&sched_domains_mutex); - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - dl_rebuild_rd_accounting(); - mutex_unlock(&sched_domains_mutex); -} - /* * Rebuild scheduler domains. * @@ -1061,7 +1053,7 @@ void rebuild_sched_domains_locked(void) ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); + partition_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ void rebuild_sched_domains_locked(void) @@ -1083,6 +1075,13 @@ void rebuild_sched_domains(void) cpus_read_unlock(); } +void cpuset_reset_sched_domains(void) +{ + mutex_lock(&cpuset_mutex); + partition_sched_domains(1, NULL, NULL); + mutex_unlock(&cpuset_mutex); +} + /** * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -4244,50 +4243,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc/<pid>/cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it - * anyway. - */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - rcu_read_lock(); - spin_lock_irq(&css_set_lock); - css = task_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - spin_unlock_irq(&css_set_lock); - rcu_read_unlock(); - - if (retval == -E2BIG) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - /* Display task mems_allowed in /proc/<pid>/status file. 
*/ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index fbe34299673d..10b63433f057 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -220,60 +220,32 @@ dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool, struct dmem_cgroup_pool_state *test_pool) { struct page_counter *climit; - struct cgroup_subsys_state *css, *next_css; + struct cgroup_subsys_state *css; struct dmemcg_state *dmemcg_iter; - struct dmem_cgroup_pool_state *pool, *parent_pool; - bool found_descendant; + struct dmem_cgroup_pool_state *pool, *found_pool; climit = &limit_pool->cnt; rcu_read_lock(); - parent_pool = pool = limit_pool; - css = &limit_pool->cs->css; - /* - * This logic is roughly equivalent to css_foreach_descendant_pre, - * except we also track the parent pool to find out which pool we need - * to calculate protection values for. - * - * We can stop the traversal once we find test_pool among the - * descendants since we don't really care about any others. - */ - while (pool != test_pool) { - next_css = css_next_child(NULL, css); - if (next_css) { - parent_pool = pool; - } else { - while (css != &limit_pool->cs->css) { - next_css = css_next_child(css, css->parent); - if (next_css) - break; - css = css->parent; - parent_pool = pool_parent(parent_pool); - } - /* - * We can only hit this when test_pool is not a - * descendant of limit_pool. - */ - if (WARN_ON_ONCE(css == &limit_pool->cs->css)) - break; - } - css = next_css; - - found_descendant = false; + css_for_each_descendant_pre(css, &limit_pool->cs->css) { dmemcg_iter = container_of(css, struct dmemcg_state, css); + found_pool = NULL; list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) { - if (pool_parent(pool) == parent_pool) { - found_descendant = true; + if (pool->region == limit_pool->region) { + found_pool = pool; break; } } - if (!found_descendant) + if (!found_pool) continue; page_counter_calculate_protection( - climit, &pool->cnt, true); + climit, &found_pool->cnt, true); + + if (found_pool == test_pool) + break; } rcu_read_unlock(); } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 074653f964c1..039d1eb2f215 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -430,9 +430,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of, if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) { + pr_info_once("Freezing with imperfect legacy cgroup freezer. " + "See cgroup.freeze of cgroup v2\n"); freeze = true; - else + } else return -EINVAL; freezer_change_state(css_freezer(of_css(of)), freeze); diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 0e26068995a6..2fa3a4fb2aaf 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -68,22 +68,6 @@ static inline bool valid_type(enum misc_res_type type) } /** - * misc_cg_res_total_usage() - Get the current total usage of the resource. - * @type: misc res type. - * - * Context: Any context. - * Return: Current total usage of the resource. - */ -u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - if (valid_type(type)) - return atomic64_read(&root_cg.res[type].usage); - - return 0; -} -EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); - -/** * misc_cg_set_capacity() - Set the capacity of the misc cgroup res. * @type: Type of the misc res. 
* @capacity: Supported capacity of the misc res on the host. diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 5877974ece92..4bb587d5d34f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -299,40 +299,6 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) spin_unlock_irq(&cgroup_rstat_lock); } -/* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) -{ - int cpu; - - lockdep_assert_held(&cgroup_rstat_lock); - - for_each_possible_cpu(cpu) { - struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - - for (; pos; pos = pos->rstat_flush_next) { - struct cgroup_subsys_state *css; - - cgroup_base_stat_flush(pos, cpu); - bpf_rstat_flush(pos, cgroup_parent(pos), cpu); - - rcu_read_lock(); - list_for_each_entry_rcu(css, &pos->rstat_css_list, - rstat_css_node) - css->ss->css_rstat_flush(css, cpu); - rcu_read_unlock(); - } - - /* play nice and yield if necessary */ - if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - __cgroup_rstat_unlock(cgrp, cpu); - if (!cond_resched()) - cpu_relax(); - __cgroup_rstat_lock(cgrp, cpu); - } - } -} - /** * cgroup_rstat_flush - flush stats in @cgrp's subtree * @cgrp: target cgroup @@ -348,38 +314,30 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) */ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) { + int cpu; + might_sleep(); + for_each_possible_cpu(cpu) { + struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); - __cgroup_rstat_unlock(cgrp, -1); -} + /* Reacquire for each CPU to avoid disabling IRQs too long */ + __cgroup_rstat_lock(cgrp, cpu); + for (; pos; pos = pos->rstat_flush_next) { + struct cgroup_subsys_state *css; -/** - * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold - * @cgrp: target cgroup - * - * Flush stats in @cgrp's subtree and prevent further flushes. Must be - * paired with cgroup_rstat_flush_release(). - * - * This function may block. 
- */ -void cgroup_rstat_flush_hold(struct cgroup *cgrp) - __acquires(&cgroup_rstat_lock) -{ - might_sleep(); - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); -} + cgroup_base_stat_flush(pos, cpu); + bpf_rstat_flush(pos, cgroup_parent(pos), cpu); -/** - * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() - * @cgrp: cgroup used by tracepoint - */ -void cgroup_rstat_flush_release(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) -{ - __cgroup_rstat_unlock(cgrp, -1); + rcu_read_lock(); + list_for_each_entry_rcu(css, &pos->rstat_css_list, + rstat_css_node) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } + __cgroup_rstat_unlock(cgrp, cpu); + if (!cond_resched()) + cpu_relax(); + } } int cgroup_rstat_init(struct cgroup *cgrp) @@ -590,7 +548,6 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; - cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; @@ -613,36 +570,34 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime, ntime; + struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; + cgroup_rstat_flush(cgrp); + __cgroup_rstat_lock(cgrp, -1); + bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, - &utime, &stime); - ntime = cgrp->bstat.ntime; - cgroup_rstat_flush_release(cgrp); + &bstat.cputime.utime, &bstat.cputime.stime); + __cgroup_rstat_unlock(cgrp, -1); } else { - /* cgrp->bstat of root is not actually used, reuse it */ - root_cgroup_cputime(&cgrp->bstat); - usage = cgrp->bstat.cputime.sum_exec_runtime; - utime = cgrp->bstat.cputime.utime; - stime = cgrp->bstat.cputime.stime; - ntime = cgrp->bstat.ntime; + root_cgroup_cputime(&bstat); } - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - do_div(ntime, NSEC_PER_USEC); + do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); + do_div(bstat.cputime.utime, NSEC_PER_USEC); + do_div(bstat.cputime.stime, NSEC_PER_USEC); + do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", - usage, utime, stime, ntime); + bstat.cputime.sum_exec_runtime, + bstat.cputime.utime, + bstat.cputime.stime, + bstat.ntime); - cgroup_force_idle_show(seq, &cgrp->bstat); + cgroup_force_idle_show(seq, &bstat); } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 3fabb8f55ef6..dd7c32fb5ac1 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -46,7 +46,7 @@ CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set -# CONFIG_UBSAN_SIGNED_WRAP is not set +# CONFIG_UBSAN_INTEGER_WRAP is not set # CONFIG_UBSAN_BOOL is not set # CONFIG_UBSAN_ENUM is not set # CONFIG_UBSAN_ALIGNMENT is not set diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 938c48952d26..fb5be6e9b423 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -80,17 +80,16 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void) */ static noinstr void 
ct_kernel_exit_state(int offset) { - int seq; - /* * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ rcu_task_trace_heavyweight_enter(); // Before CT state update! - seq = ct_state_inc(offset); - // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); + // RCU is still watching. Better not be in extended quiescent state! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu()); + (void)ct_state_inc(offset); + // RCU is no longer watching. } /* diff --git a/kernel/cpu.c b/kernel/cpu.c index 07455d25329c..b08bb34b1718 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -526,6 +526,7 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held); #ifdef CONFIG_LOCKDEP int lockdep_is_cpus_held(void) @@ -1453,11 +1454,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, out: cpus_write_unlock(); - /* - * Do post unplug cleanup. This is still protected against - * concurrent CPU hotplug via cpu_add_remove_lock. - */ - lockup_detector_cleanup(); arch_smt_update(); return ret; } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 078fe5bc5a74..335b8425dd4b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -436,7 +436,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) memset(&prstatus, 0, sizeof(prstatus)); prstatus.common.pr_pid = current->pid; elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); } diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 095c775e001e..d4b8bd0af79b 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -6,6 +6,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +# Branch profiling isn't noinstr-safe +ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING + CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e33691d5adf7..20154572ede9 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -49,7 +49,7 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Do seccomp after ptrace, to catch any tracer changes. */ if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(NULL); + ret = __secure_computing(); if (ret == -1L) return ret; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 8a47e52a454f..6c83ad674d01 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -22,6 +22,7 @@ struct callchain_cpus_entries { int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; +static const int six_hundred_forty_kb = 640 * 1024; static inline size_t perf_callchain_entry__sizeof(void) { @@ -266,12 +267,8 @@ exit_put: return entry; } -/* - * Used for sysctl_perf_event_max_stack and - * sysctl_perf_event_max_contexts_per_stack. 
- */ -int perf_event_max_stack_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int perf_event_max_stack_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; @@ -292,3 +289,32 @@ int perf_event_max_stack_handler(const struct ctl_table *table, int write, return ret; } + +static const struct ctl_table callchain_sysctl_table[] = { + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_THOUSAND, + }, +}; + +static int __init init_callchain_sysctls(void) +{ + register_sysctl_init("kernel", callchain_sysctl_table); + return 0; +} +core_initcall(init_callchain_sysctls); + diff --git a/kernel/events/core.c b/kernel/events/core.c index bcb09e011e9e..0bb21659e252 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -55,6 +55,7 @@ #include <linux/pgtable.h> #include <linux/buildid.h> #include <linux/task_work.h> +#include <linux/percpu-rwsem.h> #include "internal.h" @@ -452,8 +453,8 @@ static struct kmem_cache *perf_event_cache; */ int sysctl_perf_event_paranoid __read_mostly = 2; -/* Minimum for 512 kiB + 1 user control page */ -int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */ +static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* * max perf event sample rate @@ -463,6 +464,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; @@ -484,7 +486,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, +static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -506,9 +508,7 @@ int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, return 0; } -int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; - -int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, +static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -528,6 +528,52 @@ int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, return 0; } +static const struct ctl_table events_core_sysctl_table[] = { + /* + * User-space relies on this file as a feature check for + * perf_events being enabled. 
It's an ABI, do not remove! + */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_event_max_sample_rate_handler, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +}; + +static int __init init_events_core_sysctls(void) +{ + register_sysctl_init("kernel", events_core_sysctl_table); + return 0; +} +core_initcall(init_events_core_sysctls); + + /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not @@ -1147,8 +1193,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - timer->function = perf_mux_hrtimer_handler; + hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_HARD); } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) @@ -1172,42 +1218,40 @@ static int perf_mux_hrtimer_restart_ipi(void *arg) return perf_mux_hrtimer_restart(arg); } +static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) +{ + return *this_cpu_ptr(pmu->cpu_pmu_context); +} + void perf_pmu_disable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { - WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); -} - -static void get_ctx(struct perf_event_context *ctx) -{ - refcount_inc(&ctx->refcount); + int *count = &this_cpc(pmu)->pmu_disable_count; + WARN_ON_ONCE(*count == 0); } -static void *alloc_task_ctx_data(struct pmu *pmu) +static inline void perf_pmu_read(struct perf_event *event) { - if (pmu->task_ctx_cache) - return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); - - return NULL; + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); } -static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +static void get_ctx(struct perf_event_context *ctx) { - if (pmu->task_ctx_cache && task_ctx_data) - kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) @@ -2303,7 +2347,7 @@ static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); enum perf_event_state state = 
PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled @@ -2444,9 +2488,8 @@ __perf_remove_from_context(struct perf_event *event, pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -2584,7 +2627,7 @@ static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2691,7 +2734,7 @@ error: static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); /* * Groups consisting entirely of software events can always go on. @@ -3314,9 +3357,8 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !(ctx->is_active & EVENT_ALL)) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); - cpc = this_cpu_ptr(pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -3473,8 +3515,7 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + perf_pmu_read(event); perf_event_update_time(event); @@ -3522,52 +3563,17 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ - for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ - pos2 = list_first_entry(head2, typeof(*pos2), member); \ - !list_entry_is_head(pos1, head1, member) && \ - !list_entry_is_head(pos2, head2, member); \ - pos1 = list_next_entry(pos1, member), \ - pos2 = list_next_entry(pos2, member)) - -static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, - struct perf_event_context *next_ctx) -{ - struct perf_event_pmu_context *prev_epc, *next_epc; - - if (!prev_ctx->nr_task_data) - return; - - double_list_for_each_entry(prev_epc, next_epc, - &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, - pmu_ctx_entry) { - - if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) - continue; - - /* - * PMU specific parts of task perf context can require - * additional synchronization. 
As an example of such - * synchronization see implementation details of Intel - * LBR call stack data profiling; - */ - if (prev_epc->pmu->swap_task_ctx) - prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); - else - swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); - } -} - -static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) +static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, + struct task_struct *task, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) - pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); + pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in); } } @@ -3630,17 +3636,16 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - perf_ctx_sched_task_cb(ctx, false); - perf_event_swap_task_ctx_data(ctx, next_ctx); + perf_ctx_sched_task_cb(ctx, task, false); perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of - * ctx->task and ctx->task_ctx_data are immaterial - * since those values are always verified under - * ctx->lock which we're now holding. + * ctx->task is immaterial since this value is + * always verified under ctx->lock which we're now + * holding. */ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); @@ -3660,7 +3665,7 @@ unlock: perf_ctx_disable(ctx, false); inside_switch: - perf_ctx_sched_task_cb(ctx, false); + perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); @@ -3673,7 +3678,7 @@ static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); this_cpu_dec(perf_sched_cb_usages); barrier(); @@ -3685,7 +3690,7 @@ void perf_sched_cb_dec(struct pmu *pmu) void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); @@ -3702,7 +3707,8 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ -static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, + struct task_struct *task, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; @@ -3716,7 +3722,7 @@ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_i perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); - pmu->sched_task(cpc->task_epc, sched_in); + pmu->sched_task(cpc->task_epc, task, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -3734,7 +3740,7 @@ static void perf_pmu_sched_task(struct task_struct *prev, return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) - __perf_pmu_sched_task(cpc, sched_in); + __perf_pmu_sched_task(cpc, sched_in ? 
next : prev, sched_in); } static void perf_event_switch(struct task_struct *task, @@ -3802,7 +3808,7 @@ static void __link_epc(struct perf_event_pmu_context *pmu_ctx) if (!pmu_ctx->ctx->task) return; - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } @@ -3930,11 +3936,15 @@ static int merge_sched_in(struct perf_event *event, void *data) if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + if (*perf_event_fasync(event)) + event->pending_kill = POLL_HUP; + + perf_event_wakeup(event); } else { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); event->pmu_ctx->rotate_necessary = 1; - cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } @@ -4029,7 +4039,7 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx, false); - perf_ctx_sched_task_cb(ctx, true); + perf_ctx_sched_task_cb(ctx, task, true); perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); @@ -4060,7 +4070,7 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_event_sched_in(cpuctx, ctx, NULL); - perf_ctx_sched_task_cb(cpuctx->task_ctx, true); + perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx, false); @@ -4618,15 +4628,8 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -4883,7 +4886,7 @@ find_get_context(struct task_struct *task, struct perf_event *event) if (!task) { /* Must be root to operate on a CPU event: */ - err = perf_allow_cpu(&event->attr); + err = perf_allow_cpu(); if (err) return ERR_PTR(err); @@ -4950,8 +4953,7 @@ static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { - struct perf_event_pmu_context *new = NULL, *epc; - void *task_ctx_data = NULL; + struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; if (!ctx->task) { /* @@ -4961,11 +4963,14 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, */ struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { - atomic_set(&epc->refcount, 1); + /* + * One extra reference for the pmu; see perf_pmu_free(). 
+ */ + atomic_set(&epc->refcount, 2); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; @@ -4981,14 +4986,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, if (!new) return ERR_PTR(-ENOMEM); - if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = alloc_task_ctx_data(pmu); - if (!task_ctx_data) { - kfree(new); - return ERR_PTR(-ENOMEM); - } - } - __perf_init_event_pmu_context(new, pmu); /* @@ -5007,23 +5004,23 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, atomic_inc(&epc->refcount); goto found_epc; } + /* Make sure the pmu_ctx_list is sorted by PMU type: */ + if (!pos && epc->pmu->type > pmu->type) + pos = epc; } epc = new; new = NULL; - list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + if (!pos) + list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + else + list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); + epc->ctx = ctx; found_epc: - if (task_ctx_data && !epc->task_ctx_data) { - epc->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - ctx->nr_task_data++; - } raw_spin_unlock_irq(&ctx->lock); - - free_task_ctx_data(pmu, task_ctx_data); kfree(new); return epc; @@ -5034,11 +5031,18 @@ static void get_pmu_ctx(struct perf_event_pmu_context *epc) WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } +static void free_cpc_rcu(struct rcu_head *head) +{ + struct perf_cpu_pmu_context *cpc = + container_of(head, typeof(*cpc), epc.rcu_head); + + kfree(cpc); +} + static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); - kfree(epc->task_ctx_data); kfree(epc); } @@ -5068,8 +5072,10 @@ static void put_pmu_ctx(struct perf_event_pmu_context *epc) raw_spin_unlock_irqrestore(&ctx->lock, flags); - if (epc->embedded) + if (epc->embedded) { + call_rcu(&epc->rcu_head, free_cpc_rcu); return; + } call_rcu(&epc->rcu_head, free_epc_rcu); } @@ -5145,6 +5151,225 @@ static void unaccount_freq_event(void) atomic_dec(&nr_freq_events); } + +static struct perf_ctx_data * +alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global) +{ + struct perf_ctx_data *cd; + + cd = kzalloc(sizeof(*cd), GFP_KERNEL); + if (!cd) + return NULL; + + cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL); + if (!cd->data) { + kfree(cd); + return NULL; + } + + cd->global = global; + cd->ctx_cache = ctx_cache; + refcount_set(&cd->refcount, 1); + + return cd; +} + +static void free_perf_ctx_data(struct perf_ctx_data *cd) +{ + kmem_cache_free(cd->ctx_cache, cd->data); + kfree(cd); +} + +static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head) +{ + struct perf_ctx_data *cd; + + cd = container_of(rcu_head, struct perf_ctx_data, rcu_head); + free_perf_ctx_data(cd); +} + +static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd) +{ + call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu); +} + +static int +attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, + bool global) +{ + struct perf_ctx_data *cd, *old = NULL; + + cd = alloc_perf_ctx_data(ctx_cache, global); + if (!cd) + return -ENOMEM; + + for (;;) { + if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { + if (old) + perf_free_ctx_data_rcu(old); + return 0; + } + + if (!old) { + /* + * After seeing a dead @old, we raced with + * removal and lost, try again to install @cd. 
+ */ + continue; + } + + if (refcount_inc_not_zero(&old->refcount)) { + free_perf_ctx_data(cd); /* unused */ + return 0; + } + + /* + * @old is a dead object, refcount==0 is stable, try and + * replace it with @cd. + */ + } + return 0; +} + +static void __detach_global_ctx_data(void); +DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem); +static refcount_t global_ctx_data_ref; + +static int +attach_global_ctx_data(struct kmem_cache *ctx_cache) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + int ret; + + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; +again: + /* Allocate everything */ + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (cd && !cd->global) { + cd->global = 1; + if (!refcount_inc_not_zero(&cd->refcount)) + cd = NULL; + } + if (!cd) { + get_task_struct(p); + goto alloc; + } + } + } + + refcount_set(&global_ctx_data_ref, 1); + + return 0; +alloc: + ret = attach_task_ctx_data(p, ctx_cache, true); + put_task_struct(p); + if (ret) { + __detach_global_ctx_data(); + return ret; + } + goto again; +} + +static int +attach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache; + int ret; + + if (!ctx_cache) + return -ENOMEM; + + if (task) + return attach_task_ctx_data(task, ctx_cache, false); + + ret = attach_global_ctx_data(ctx_cache); + if (ret) + return ret; + + event->attach_state |= PERF_ATTACH_GLOBAL_DATA; + return 0; +} + +static void +detach_task_ctx_data(struct task_struct *p) +{ + struct perf_ctx_data *cd; + + scoped_guard (rcu) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !refcount_dec_and_test(&cd->refcount)) + return; + } + + /* + * The old ctx_data may be lost because of the race. + * Nothing is required to do for the case. + * See attach_task_ctx_data(). 
+ */ + if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL)) + perf_free_ctx_data_rcu(cd); +} + +static void __detach_global_ctx_data(void) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + +again: + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !cd->global) + continue; + cd->global = 0; + get_task_struct(p); + goto detach; + } + } + return; +detach: + detach_task_ctx_data(p); + put_task_struct(p); + goto again; +} + +static void detach_global_ctx_data(void) +{ + if (refcount_dec_not_one(&global_ctx_data_ref)) + return; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (!refcount_dec_and_test(&global_ctx_data_ref)) + return; + + /* remove everything */ + __detach_global_ctx_data(); +} + +static void detach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + + event->attach_state &= ~PERF_ATTACH_TASK_DATA; + + if (task) + return detach_task_ctx_data(task); + + if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) { + detach_global_ctx_data(); + event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA; + } +} + static void unaccount_event(struct perf_event *event) { bool dec = false; @@ -5239,6 +5464,8 @@ static int exclusive_event_init(struct perf_event *event) return -EBUSY; } + event->attach_state |= PERF_ATTACH_EXCLUSIVE; + return 0; } @@ -5246,14 +5473,13 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!is_exclusive_pmu(pmu)) - return; - /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); + + event->attach_state &= ~PERF_ATTACH_EXCLUSIVE; } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) @@ -5285,8 +5511,7 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void perf_addr_filters_splice(struct perf_event *event, - struct list_head *head); +static void perf_free_addr_filters(struct perf_event *event); static void perf_pending_task_sync(struct perf_event *event) { @@ -5312,39 +5537,22 @@ static void perf_pending_task_sync(struct perf_event *event) rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); } -static void _free_event(struct perf_event *event) +/* vs perf_event_alloc() error */ +static void __free_event(struct perf_event *event) { - irq_work_sync(&event->pending_irq); - irq_work_sync(&event->pending_disable_irq); - perf_pending_task_sync(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) + put_callchain_buffers(); - unaccount_event(event); + kfree(event->addr_filter_ranges); - security_perf_event_free(event); - - if (event->rb) { - /* - * Can happen when we close an event with re-directed output. - * - * Since we have a 0 refcount, perf_mmap_close() will skip - * over us; possibly making our ring_buffer_put() the last. 
- */ - mutex_lock(&event->mmap_mutex); - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - } + if (event->attach_state & PERF_ATTACH_EXCLUSIVE) + exclusive_event_destroy(event); if (is_cgroup_event(event)) perf_detach_cgroup(event); - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - perf_addr_filters_splice(event, NULL); - kfree(event->addr_filter_ranges); + if (event->attach_state & PERF_ATTACH_TASK_DATA) + detach_perf_ctx_data(event); if (event->destroy) event->destroy(event); @@ -5356,22 +5564,60 @@ static void _free_event(struct perf_event *event) if (event->hw.target) put_task_struct(event->hw.target); - if (event->pmu_ctx) + if (event->pmu_ctx) { + /* + * put_pmu_ctx() needs an event->ctx reference, because of + * epc->ctx. + */ + WARN_ON_ONCE(!event->ctx); + WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); + } /* - * perf_event_free_task() relies on put_ctx() being 'last', in particular - * all task references must be cleaned up. + * perf_event_free_task() relies on put_ctx() being 'last', in + * particular all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); - exclusive_event_destroy(event); - module_put(event->pmu->module); + if (event->pmu) + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } +DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) + +/* vs perf_event_alloc() success */ +static void _free_event(struct perf_event *event) +{ + irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); + perf_pending_task_sync(event); + + unaccount_event(event); + + security_perf_event_free(event); + + if (event->rb) { + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + __free_event(event); +} + /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. @@ -5840,6 +6086,10 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) if (is_event_hup(event)) return events; + if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && + event->attr.pinned)) + return events; + /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. 
@@ -5962,14 +6212,15 @@ static int _perf_event_period(struct perf_event *event, u64 value) if (!value) return -EINVAL; - if (event->attr.freq && value > sysctl_perf_event_sample_rate) - return -EINVAL; - - if (perf_event_check_period(event, value)) - return -EINVAL; - - if (!event->attr.freq && (value & (1ULL << 63))) - return -EINVAL; + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) + return -EINVAL; + } else { + if (perf_event_check_period(event, value)) + return -EINVAL; + if (value & (1ULL << 63)) + return -EINVAL; + } event_function_call(event, __perf_event_period, &value); @@ -6608,7 +6859,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; - int ret = 0, flags = 0; + int ret, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would @@ -6626,9 +6877,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; + + user_extra = nr_pages; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; if (vma->vm_pgoff == 0) { - nr_pages = (vma_size / PAGE_SIZE) - 1; + nr_pages -= 1; + + /* + * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + goto unlock; + + WARN_ON_ONCE(event->ctx->parent_ctx); + + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + goto unlock; + + if (atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Success -- managed to mmap() the same buffer + * multiple times. + */ + ret = 0; + /* We need the rb to map pages. */ + rb = event->rb; + goto unlock; + } + + /* + * Raced against perf_mmap_close()'s + * atomic_dec_and_mutex_lock() remove the + * event and continue as if !event->rb + */ + ring_buffer_attach(event, NULL); + } + } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -6637,16 +6933,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) */ u64 aux_offset, aux_size; - if (!event->rb) - return -EINVAL; - - nr_pages = vma_size / PAGE_SIZE; - if (nr_pages > INT_MAX) - return -ENOMEM; - - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; - rb = event->rb; if (!rb) goto aux_unlock; @@ -6687,48 +6973,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } atomic_set(&rb->aux_mmap_count, 1); - user_extra = nr_pages; - - goto accounting; - } - - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - WARN_ON_ONCE(event->ctx->parent_ctx); -again: - mutex_lock(&event->mmap_mutex); - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) { - ret = -EINVAL; - goto unlock; - } - - if (!atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Raced against perf_mmap_close(); remove the - * event and try again. - */ - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - goto again; - } - - /* We need the rb to map pages. 
*/ - rb = event->rb; - goto unlock; } - user_extra = nr_pages + 1; - -accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -6796,6 +7042,8 @@ accounting: rb->aux_mmap_locked = extra; } + ret = 0; + unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); @@ -6820,7 +7068,7 @@ aux_unlock: if (!ret) ret = map_range(rb, vma); - if (event->pmu->event_mapped) + if (!ret && event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); return ret; @@ -7444,9 +7692,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && - (leader->state == PERF_EVENT_STATE_ACTIVE)) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) @@ -7459,9 +7706,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) @@ -7520,6 +7766,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) @@ -8321,7 +8570,8 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); + scoped_guard(rcu) + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); @@ -8513,10 +8763,58 @@ static void perf_event_task(struct task_struct *task, task_ctx); } +/* + * Allocate data for a new task when profiling system-wide + * events which require PMU specific data + */ +static void +perf_event_alloc_task_data(struct task_struct *child, + struct task_struct *parent) +{ + struct kmem_cache *ctx_cache = NULL; + struct perf_ctx_data *cd; + + if (!refcount_read(&global_ctx_data_ref)) + return; + + scoped_guard (rcu) { + cd = rcu_dereference(parent->perf_ctx_data); + if (cd) + ctx_cache = cd->ctx_cache; + } + + if (!ctx_cache) + return; + + guard(percpu_read)(&global_ctx_data_rwsem); + scoped_guard (rcu) { + cd = rcu_dereference(child->perf_ctx_data); + if (!cd) { + /* + * A system-wide event may be unaccount, + * when attaching the perf_ctx_data. 
+ */ + if (!refcount_read(&global_ctx_data_ref)) + return; + goto attach; + } + + if (!cd->global) { + cd->global = 1; + refcount_inc(&cd->refcount); + } + } + + return; +attach: + attach_task_ctx_data(child, ctx_cache, true); +} + void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); + perf_event_alloc_task_data(task, current); } /* @@ -8580,7 +8878,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) unsigned int size; memset(comm, 0, sizeof(comm)); - strscpy(comm, comm_event->task->comm, sizeof(comm)); + strscpy(comm, comm_event->task->comm); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -9024,7 +9322,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } cpy_name: - strscpy(tmp, name, sizeof(tmp)); + strscpy(tmp, name); name = tmp; got_name: /* @@ -9448,7 +9746,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; - strscpy(name, sym, KSYM_NAME_LEN); + strscpy(name, sym); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; @@ -10831,6 +11129,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, void perf_event_free_bpf_prog(struct perf_event *event) { + if (!event->prog) + return; + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; @@ -10929,6 +11230,17 @@ static void perf_addr_filters_splice(struct perf_event *event, free_filters_list(&list); } +static void perf_free_addr_filters(struct perf_event *event) +{ + /* + * Used during free paths, there is no concurrency. + */ + if (list_empty(&event->addr_filters.list)) + return; + + perf_addr_filters_splice(event, NULL); +} + /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. 
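The next hunk, like the perf_mux_hrtimer and copy_signal() changes earlier in this series, folds the open-coded hrtimer_init()/->function assignment into a single hrtimer_setup() call. A minimal sketch of that conversion pattern follows; struct my_ctx, my_ctx_init() and my_sample_hrtimer_fn are illustrative placeholders, not symbols from this patch.

/*
 * Sketch of the hrtimer_init() -> hrtimer_setup() conversion used
 * throughout this diff. The names below are placeholders.
 */
#include <linux/hrtimer.h>
#include <linux/ktime.h>

struct my_ctx {
	struct hrtimer timer;
};

static enum hrtimer_restart my_sample_hrtimer_fn(struct hrtimer *t)
{
	/* Handler body elided; one-shot timer, so do not restart. */
	return HRTIMER_NORESTART;
}

static void my_ctx_init(struct my_ctx *ctx)
{
	/* Before: init and handler assignment were two separate steps:
	 *   hrtimer_init(&ctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *   ctx->timer.function = my_sample_hrtimer_fn;
	 */

	/* After: one call wires up handler, clock and mode together. */
	hrtimer_setup(&ctx->timer, my_sample_hrtimer_fn, CLOCK_MONOTONIC,
		      HRTIMER_MODE_REL);
}

The one-call form leaves no window in which the timer is initialized but has no handler, which is why the tree-wide conversion prefers it.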
@@ -11367,8 +11679,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hwc->hrtimer.function = perf_swevent_hrtimer; + hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); /* * Since hrtimers have a fixed rate, we can do a static freq->period @@ -11605,11 +11916,6 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } -static void free_pmu_context(struct pmu *pmu) -{ - free_percpu(pmu->cpu_pmu_context); -} - /* * Let userspace know that this PMU supports address range filtering: */ @@ -11619,7 +11925,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return sysfs_emit(page, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -11630,7 +11936,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); + return sysfs_emit(page, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -11641,7 +11947,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); + return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -11672,7 +11978,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); @@ -11815,62 +12121,107 @@ del_dev: free_dev: put_device(pmu->dev); + pmu->dev = NULL; goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; -int perf_pmu_register(struct pmu *pmu, const char *name, int type) +static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) { - int cpu, ret, max = PERF_TYPE_MAX; + void *tmp, *val = idr_find(idr, id); - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; + if (val != old) + return false; - pmu->type = -1; - if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { - ret = -EINVAL; - goto free_pdc; + tmp = idr_replace(idr, new, id); + if (IS_ERR(tmp)) + return false; + + WARN_ON_ONCE(tmp != val); + return true; +} + +static void perf_pmu_free(struct pmu *pmu) +{ + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); } - if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { - ret = -EINVAL; - goto free_pdc; + if (pmu->cpu_pmu_context) { + int cpu; + + for_each_possible_cpu(cpu) { + struct perf_cpu_pmu_context *cpc; + + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); + if (!cpc) + continue; + if (cpc->epc.embedded) { + /* refcount managed */ + put_pmu_ctx(&cpc->epc); + continue; + } + kfree(cpc); + } + free_percpu(pmu->cpu_pmu_context); } +} + +DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T)) + +int perf_pmu_register(struct pmu 
*_pmu, const char *name, int type) +{ + int cpu, max = PERF_TYPE_MAX; + + struct pmu *pmu __free(pmu_unregister) = _pmu; + guard(mutex)(&pmus_lock); + + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) + return -EINVAL; + + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, + "Can not register a pmu with an invalid scope.\n")) + return -EINVAL; pmu->name = name; if (type >= 0) max = type; - ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); - if (ret < 0) - goto free_pdc; + CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL); + if (pmu_type.id < 0) + return pmu_type.id; - WARN_ON(type >= 0 && ret != type); + WARN_ON(type >= 0 && pmu_type.id != type); - type = ret; - pmu->type = type; + pmu->type = pmu_type.id; + atomic_set(&pmu->exclusive_cnt, 0); if (pmu_bus_running && !pmu->dev) { - ret = pmu_dev_alloc(pmu); + int ret = pmu_dev_alloc(pmu); if (ret) - goto free_idr; + return ret; } - ret = -ENOMEM; - pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *); if (!pmu->cpu_pmu_context) - goto free_dev; + return -ENOMEM; for_each_possible_cpu(cpu) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = + kmalloc_node(sizeof(struct perf_cpu_pmu_context), + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + + if (!cpc) + return -ENOMEM; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc; __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } @@ -11903,33 +12254,25 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + /* + * Now that the PMU is complete, make it visible to perf_try_init_event(). 
+ */ + if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) + return -EINVAL; list_add_rcu(&pmu->entry, &pmus); - atomic_set(&pmu->exclusive_cnt, 0); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - - return ret; - -free_dev: - if (pmu->dev && pmu->dev != PMU_NULL_DEV) { - device_del(pmu->dev); - put_device(pmu->dev); - } -free_idr: - idr_remove(&pmu_idr, pmu->type); - -free_pdc: - free_percpu(pmu->pmu_disable_count); - goto unlock; + take_idr_id(pmu_type); + _pmu = no_free_ptr(pmu); // let it rip + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - mutex_lock(&pmus_lock); - list_del_rcu(&pmu->entry); + scoped_guard (mutex, &pmus_lock) { + list_del_rcu(&pmu->entry); + idr_remove(&pmu_idr, pmu->type); + } /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -11938,16 +12281,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_srcu(&pmus_srcu); synchronize_rcu(); - free_percpu(pmu->pmu_disable_count); - idr_remove(&pmu_idr, pmu->type); - if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); - } - free_pmu_context(pmu); - mutex_unlock(&pmus_lock); + perf_pmu_free(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -11987,48 +12321,61 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); - if (!ret) { - if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && - has_extended_regs(event)) - ret = -EOPNOTSUPP; + if (ret) + goto err_pmu; - if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) - ret = -EINVAL; + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) { + ret = -EOPNOTSUPP; + goto err_destroy; + } - if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { - const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); - struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); - int cpu; - - if (pmu_cpumask && cpumask) { - cpu = cpumask_any_and(pmu_cpumask, cpumask); - if (cpu >= nr_cpu_ids) - ret = -ENODEV; - else - event->event_caps |= PERF_EV_CAP_READ_SCOPE; - } else { - ret = -ENODEV; - } - } + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + ret = -EINVAL; + goto err_destroy; + } + + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask; + struct cpumask *pmu_cpumask; + int cpu; + + cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + pmu_cpumask = perf_scope_cpumask(pmu->scope); + + ret = -ENODEV; + if (!pmu_cpumask || !cpumask) + goto err_destroy; - if (ret && event->destroy) - event->destroy(event); + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + goto err_destroy; + + event->event_caps |= PERF_EV_CAP_READ_SCOPE; } - if (ret) - module_put(pmu->module); + return 0; +err_destroy: + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } + +err_pmu: + event->pmu = NULL; + module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; - int idx, type, ret; struct pmu *pmu; + int type, ret; - idx = srcu_read_lock(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* * Save original type before calling pmu->event_init() since certain @@ -12041,7 +12388,7 @@ static struct pmu *perf_init_event(struct 
perf_event *event) pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; } /* @@ -12060,13 +12407,12 @@ static struct pmu *perf_init_event(struct perf_event *event) } again: - rcu_read_lock(); - pmu = idr_find(&pmu_idr, type); - rcu_read_unlock(); + scoped_guard (rcu) + pmu = idr_find(&pmu_idr, type); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) - goto fail; + return ERR_PTR(-ENOENT); ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { @@ -12075,27 +12421,21 @@ again: } if (ret) - pmu = ERR_PTR(ret); + return ERR_PTR(ret); - goto unlock; + return pmu; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } + if (ret != -ENOENT) + return ERR_PTR(ret); } -fail: - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - return pmu; + return ERR_PTR(-ENOENT); } static void attach_sb_event(struct perf_event *event) @@ -12222,7 +12562,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, void *context, int cgroup_fd) { struct pmu *pmu; - struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; @@ -12237,8 +12576,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; - event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, - node); + struct perf_event *event __free(__free_event) = + kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); @@ -12345,15 +12684,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * See perf_output_read(). */ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) - goto err_ns; + return ERR_PTR(-EINVAL); if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); - if (IS_ERR(pmu)) { - err = PTR_ERR(pmu); - goto err_ns; + if (IS_ERR(pmu)) + return (void*)pmu; + + /* + * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config(). + * The attach should be right after the perf_init_event(). + * Otherwise, the __free_event() would mistakenly detach the non-exist + * perf_ctx_data because of the other errors between them. + */ + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + err = attach_perf_ctx_data(event); + if (err) + return ERR_PTR(err); } /* @@ -12361,49 +12710,39 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). 
*/ - if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { - err = -EINVAL; - goto err_pmu; - } + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) + return ERR_PTR(-EINVAL); if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || - event->attr.aux_pause || event->attr.aux_resume)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + event->attr.aux_pause || event->attr.aux_resume)) + return ERR_PTR(-EOPNOTSUPP); - if (event->attr.aux_pause && event->attr.aux_resume) { - err = -EINVAL; - goto err_pmu; - } + if (event->attr.aux_pause && event->attr.aux_resume) + return ERR_PTR(-EINVAL); if (event->attr.aux_start_paused) { - if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return ERR_PTR(-EOPNOTSUPP); event->hw.aux_paused = 1; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) - goto err_pmu; + return ERR_PTR(err); } err = exclusive_event_init(event); if (err) - goto err_pmu; + return ERR_PTR(err); if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); - if (!event->addr_filter_ranges) { - err = -ENOMEM; - goto err_per_task; - } + if (!event->addr_filter_ranges) + return ERR_PTR(-ENOMEM); /* * Clone the parent's vma offsets: they are valid until exec() @@ -12427,42 +12766,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err_addr_filters; + return ERR_PTR(err); + event->attach_state |= PERF_ATTACH_CALLCHAIN; } } err = security_perf_event_alloc(event); if (err) - goto err_callchain_buffer; + return ERR_PTR(err); /* symmetric to unaccount_event() in _free_event() */ account_event(event); - return event; - -err_callchain_buffer: - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } -err_addr_filters: - kfree(event->addr_filter_ranges); - -err_per_task: - exclusive_event_destroy(event); - -err_pmu: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - if (event->destroy) - event->destroy(event); - module_put(pmu->module); -err_ns: - if (event->hw.target) - put_task_struct(event->hw.target); - call_rcu(&event->rcu_head, free_event_rcu); - - return ERR_PTR(err); + return_ptr(event); } static int perf_copy_attr(struct perf_event_attr __user *uattr, @@ -12532,7 +12848,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { - ret = perf_allow_kernel(attr); + ret = perf_allow_kernel(); if (ret) return ret; } @@ -12789,12 +13105,12 @@ SYSCALL_DEFINE5(perf_event_open, return err; /* Do we allow access to perf_event_open(2) ? 
*/ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = security_perf_event_open(PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -12814,7 +13130,7 @@ SYSCALL_DEFINE5(perf_event_open, /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -13536,6 +13852,12 @@ void perf_event_exit_task(struct task_struct *child) * At this point we need to send EXIT events to cpu contexts. */ perf_event_task(child, NULL, 0); + + /* + * Detach the perf_ctx_data for the system-wide event. + */ + guard(percpu_read)(&global_ctx_data_rwsem); + detach_task_ctx_data(child); } static void perf_free_event(struct perf_event *event, @@ -13647,12 +13969,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } -int perf_allow_kernel(struct perf_event_attr *attr) +int perf_allow_kernel(void) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; - return security_perf_event_open(attr, PERF_SECURITY_KERNEL); + return security_perf_event_open(PERF_SECURITY_KERNEL); } EXPORT_SYMBOL_GPL(perf_allow_kernel); @@ -13711,7 +14033,6 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); - /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } @@ -13969,6 +14290,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); + child->perf_ctx_data = NULL; ret = perf_event_init_context(child, clone_flags); if (ret) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bc4a61029b6d..8ec2cb688903 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return -ENOENT; /* - * no branch sampling for breakpoint events + * Check if breakpoint type is supported before proceeding. + * Also, no branch sampling for breakpoint events. 
*/ - if (has_branch_stack(bp)) + if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 180509132d4b..5130b119d0ae 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -19,7 +19,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, EPOLLIN); + atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM); handle->event->pending_wakeup = 1; @@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; + handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ca797cbe465..70c84b9d7be3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -417,7 +417,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " - "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); @@ -495,6 +495,11 @@ retry: if (ret <= 0) goto put_old; + if (is_zero_page(old_page)) { + ret = -EINVAL; + goto put_old; + } + if (WARN(!is_register && PageCompound(old_page), "uprobe unregister should never work on compound page\n")) { ret = -EINVAL; @@ -762,10 +767,14 @@ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) enum hprobe_state hstate; /* - * return_instance's hprobe is protected by RCU. - * Underlying uprobe is itself protected from reuse by SRCU. + * Caller should guarantee that return_instance is not going to be + * freed from under us. This can be achieved either through holding + * rcu_read_lock() or by owning return_instance in the first place. + * + * Underlying uprobe is itself protected from reuse by SRCU, so ensure + * SRCU lock is held properly. 
*/ - lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); + lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { @@ -2160,8 +2169,8 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) */ unsigned long uprobe_get_trampoline_vaddr(void) { + unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; - unsigned long trampoline_vaddr = -1; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ @@ -2302,9 +2311,8 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { - spin_lock_irq(&t->sighand->siglock); + utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); - spin_unlock_irq(&t->sighand->siglock); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; @@ -2737,9 +2745,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* see uprobe_deny_signal() */ - spin_unlock_irq(¤t->sighand->siglock); + if (utask->signal_denied) { + set_thread_flag(TIF_SIGPENDING); + utask->signal_denied = false; + } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); diff --git a/kernel/exit.c b/kernel/exit.c index 3485e5fc499e..c2e6c7b7779f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,6 +69,7 @@ #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> +#include <linux/pidfs.h> #include <uapi/linux/wait.h> @@ -122,14 +123,22 @@ static __init int kernel_exit_sysfs_init(void) late_initcall(kernel_exit_sysfs_init); #endif -static void __unhash_process(struct task_struct *p, bool group_dead) +/* + * For things release_task() would like to do *after* tasklist_lock is released. + */ +struct release_task_post { + struct pid *pids[PIDTYPE_MAX]; +}; + +static void __unhash_process(struct release_task_post *post, struct task_struct *p, + bool group_dead) { nr_threads--; - detach_pid(p, PIDTYPE_PID); + detach_pid(post->pids, p, PIDTYPE_PID); if (group_dead) { - detach_pid(p, PIDTYPE_TGID); - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); + detach_pid(post->pids, p, PIDTYPE_TGID); + detach_pid(post->pids, p, PIDTYPE_PGID); + detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); @@ -141,7 +150,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) /* * This function expects the tasklist_lock write-locked. */ -static void __exit_signal(struct task_struct *tsk) +static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); @@ -174,9 +183,6 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); - /* * Accumulate here the counters for all threads as they die. 
We could * skip the group leader because it is the last user of signal_struct, @@ -197,23 +203,15 @@ static void __exit_signal(struct task_struct *tsk) task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; - __unhash_process(tsk, group_dead); + __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. - */ - flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk, TIF_SIGPENDING); - if (group_dead) { - flush_sigqueue(&sig->shared_pending); + if (group_dead) tty_kref_put(tty); - } } static void delayed_put_task_struct(struct rcu_head *rhp) @@ -239,22 +237,27 @@ void __weak release_thread(struct task_struct *dead_task) void release_task(struct task_struct *p) { + struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: + memset(&post, 0, sizeof(post)); + /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); + pidfs_exit(p); cgroup_release(p); + thread_pid = get_pid(p->thread_pid); + write_lock_irq(&tasklist_lock); ptrace_release_task(p); - thread_pid = get_pid(p->thread_pid); - __exit_signal(p); + __exit_signal(&post, p); /* * If we are the last non-leader member of the thread @@ -278,7 +281,20 @@ repeat: write_unlock_irq(&tasklist_lock); proc_flush_pid(thread_pid); put_pid(thread_pid); + add_device_randomness(&p->se.sum_exec_runtime, + sizeof(p->se.sum_exec_runtime)); + free_pids(post.pids); release_thread(p); + /* + * This task was already removed from the process/thread/pid lists + * and lock_task_sighand(p) can't succeed. Nobody else can touch + * ->pending or, if group dead, signal->shared_pending. We can call + * flush_sigqueue() lockless. + */ + flush_sigqueue(&p->pending); + if (thread_group_leader(p)) + flush_sigqueue(&p->signal->shared_pending); + put_task_struct_rcu_user(p); p = leader; @@ -741,10 +757,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_state = EXIT_ZOMBIE; /* - * sub-thread or delay_group_leader(), wake up the - * PIDFD_THREAD waiters. + * Ignore thread-group leaders that exited before all + * subthreads did. */ - if (!thread_group_empty(tsk)) + if (!delay_group_leader(tsk)) do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { diff --git a/kernel/fork.c b/kernel/fork.c index 735405a9c5f3..1b659b07ecd5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -504,6 +504,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) vma_numab_state_init(new); dup_anon_vma_name(orig, new); + /* track_pfn_copy() will later take care of copying internal state. 
*/ + if (unlikely(new->vm_flags & VM_PFNMAP)) + untrack_pfn_clear(new); + return new; } @@ -1891,8 +1895,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); INIT_HLIST_HEAD(&sig->ignored_posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->real_timer.function = it_real_fn; + hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); @@ -2032,25 +2035,18 @@ static inline void rcu_copy_process(struct task_struct *p) */ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - int pidfd; struct file *pidfd_file; - pidfd = get_unused_fd_flags(O_CLOEXEC); + CLASS(get_unused_fd, pidfd)(O_CLOEXEC); if (pidfd < 0) return pidfd; pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); - if (IS_ERR(pidfd_file)) { - put_unused_fd(pidfd); + if (IS_ERR(pidfd_file)) return PTR_ERR(pidfd_file); - } - /* - * anon_inode_getfile() ignores everything outside of the - * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. - */ - pidfd_file->f_flags |= (flags & PIDFD_THREAD); + *ret = pidfd_file; - return pidfd; + return take_fd(pidfd); } /** @@ -2432,8 +2428,11 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; - /* Note that no task has been attached to @pid yet. */ - retval = __pidfd_prepare(pid, flags, &pidfile); + /* + * Note that no task has been attached to @pid yet indicate + * that via CLONE_PIDFD. + */ + retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; diff --git a/kernel/futex/core.c b/kernel/futex/core.c index ebdd76b4ecbb..cca15859a50b 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -50,10 +50,10 @@ */ static struct { struct futex_hash_bucket *queues; - unsigned long hashsize; + unsigned long hashmask; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_queues (__futex_data.queues) -#define futex_hashsize (__futex_data.hashsize) +#define futex_hashmask (__futex_data.hashmask) /* @@ -119,7 +119,7 @@ struct futex_hash_bucket *futex_hash(union futex_key *key) u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); - return &futex_queues[hash & (futex_hashsize - 1)]; + return &futex_queues[hash & futex_hashmask]; } @@ -532,7 +532,8 @@ void futex_q_unlock(struct futex_hash_bucket *hb) futex_hb_waiters_dec(hb); } -void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task) { int prio; @@ -548,7 +549,7 @@ void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); - q->task = current; + q->task = task; } /** @@ -1126,27 +1127,28 @@ void futex_exit_release(struct task_struct *tsk) static int __init futex_init(void) { + unsigned long hashsize, i; unsigned int futex_shift; - unsigned long i; #ifdef CONFIG_BASE_SMALL - futex_hashsize = 16; + hashsize = 16; #else - futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + hashsize = roundup_pow_of_two(256 * num_possible_cpus()); #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, 0, + hashsize, 0, 0, &futex_shift, NULL, - futex_hashsize, futex_hashsize); - futex_hashsize = 1UL << 
futex_shift; + hashsize, hashsize); + hashsize = 1UL << futex_shift; - for (i = 0; i < futex_hashsize; i++) { + for (i = 0; i < hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } + futex_hashmask = hashsize - 1; return 0; } core_initcall(futex_init); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 99b32e728c4a..6b2f4c7eb720 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -285,13 +285,15 @@ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) } extern void __futex_unqueue(struct futex_q *q); -extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); +extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task); extern int futex_unqueue(struct futex_q *q); /** * futex_queue() - Enqueue the futex_q on the futex_hash_bucket * @q: The futex_q to enqueue * @hb: The destination hash bucket + * @task: Task queueing this futex * * The hb->lock must be held by the caller, and is released here. A call to * futex_queue() is typically paired with exactly one call to futex_unqueue(). The @@ -299,11 +301,14 @@ extern int futex_unqueue(struct futex_q *q); * or nothing if the unqueue is done as part of the wake process and the unqueue * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). + * + * Note that @task may be NULL, for async usage of futexes. */ -static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task) __releases(&hb->lock) { - __futex_queue(q, hb); + __futex_queue(q, hb, task); spin_unlock(&hb->lock); } diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index daea650b16f5..7a941845f7ee 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -982,7 +982,7 @@ retry_private: /* * Only actually queue now that the atomic ops are done: */ - __futex_queue(&q, hb); + __futex_queue(&q, hb, current); if (trylock) { ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index eb86a7ade06a..25877d4f2f8f 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -349,7 +349,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - futex_queue(q, hb); + futex_queue(q, hb, current); /* Arm the timer */ if (timeout) @@ -460,7 +460,7 @@ retry: * next futex. Queue each futex at this moment so hb can * be unlocked. 
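Editor's note: caching the mask instead of the size removes the "- 1" from every futex_hash() call; the mask is computed once in futex_init() after the table has been sized. The indexing relies on the table size being a power of two. A tiny standalone illustration (the hash constant is a stand-in for jhash2() output):

#include <stdint.h>
#include <stdio.h>

static unsigned long roundup_pow2(unsigned long x)
{
	unsigned long v = 1;

	while (v < x)
		v <<= 1;
	return v;
}

int main(void)
{
	unsigned long hashsize = roundup_pow2(256 * 8);	/* e.g. 8 possible CPUs */
	unsigned long hashmask = hashsize - 1;		/* what futex_hashmask stores */
	uint32_t hash = 0x9e3779b9;			/* stand-in for jhash2() */

	/* hash & mask == hash % size only because size is a power of two */
	printf("bucket %lu of %lu\n", (unsigned long)(hash & hashmask), hashsize);
	return 0;
}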
*/ - futex_queue(q, hb); + futex_queue(q, hb, current); continue; } diff --git a/kernel/iomem.c b/kernel/iomem.c index dc2120776e1c..75e61c1c6bc0 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -6,7 +6,8 @@ #include <linux/ioremap.h> #ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +static void *arch_memremap_wb(resource_size_t offset, unsigned long size, + unsigned long flags) { #ifdef ioremap_cache return (__force void *)ioremap_cache(offset, size); @@ -91,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size, flags); if (!addr) - addr = arch_memremap_wb(offset, size); + addr = arch_memremap_wb(offset, size, flags); } /* diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5432418c0fea..3f02a0e45254 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -31,10 +31,6 @@ config GENERIC_IRQ_EFFECTIVE_AFF_MASK config GENERIC_PENDING_IRQ bool -# Deduce delayed migration from top-level interrupt chip flags -config GENERIC_PENDING_IRQ_CHIPFLAGS - bool - # Support for generic irq migrating off cpu before the cpu is offline. config GENERIC_IRQ_MIGRATION bool @@ -51,10 +47,6 @@ config GENERIC_IRQ_INJECTION config HARDIRQS_SW_RESEND bool -# Edge style eoi based handler (cell) -config IRQ_EDGE_EOI_HANDLER - bool - # Generic configurable interrupt chip implementation config GENERIC_IRQ_CHIP bool @@ -100,6 +92,7 @@ config GENERIC_MSI_IRQ bool select IRQ_DOMAIN_HIERARCHY +# irqchip drivers should select this if they call iommu_dma_prepare_msi() config IRQ_MSI_IOMMU bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c901436ebd9f..36cf1b09cc84 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -232,6 +232,21 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, } #endif +static void irq_enable(struct irq_desc *desc) +{ + if (!irqd_irq_disabled(&desc->irq_data)) { + unmask_irq(desc); + } else { + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) { + desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_state_clr_masked(desc); + } else { + unmask_irq(desc); + } + } +} + static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -332,21 +347,6 @@ void irq_shutdown_and_deactivate(struct irq_desc *desc) irq_domain_deactivate_irq(&desc->irq_data); } -void irq_enable(struct irq_desc *desc) -{ - if (!irqd_irq_disabled(&desc->irq_data)) { - unmask_irq(desc); - } else { - irq_state_clr_disabled(desc); - if (desc->irq_data.chip->irq_enable) { - desc->irq_data.chip->irq_enable(&desc->irq_data); - irq_state_clr_masked(desc); - } else { - unmask_irq(desc); - } - } -} - static void __irq_disable(struct irq_desc *desc, bool mask) { if (irqd_irq_disabled(&desc->irq_data)) { @@ -838,53 +838,6 @@ out_unlock: } EXPORT_SYMBOL(handle_edge_irq); -#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER -/** - * handle_edge_eoi_irq - edge eoi type IRQ handler - * @desc: the interrupt description structure for this irq - * - * Similar as the above handle_edge_irq, but using eoi and w/o the - * mask/unmask logic. - */ -void handle_edge_eoi_irq(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - - raw_spin_lock(&desc->lock); - - desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - - if (!irq_may_run(desc)) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } - - /* - * If its disabled or no action available then mask it and get - * out of here. 
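Editor's note: arch_memremap_wb() gaining a flags argument lets an architecture override see the caller's MEMREMAP_* flags; the generic fallback shown above simply ignores them, so existing callers are unaffected. A hedged driver-style sketch of the unchanged caller API:

#include <linux/io.h>		/* memremap(), memunmap(), MEMREMAP_WB */

/* Illustrative only: map a firmware-described table cacheably. */
static void *map_fw_table(resource_size_t pa, size_t len)
{
	void *va = memremap(pa, len, MEMREMAP_WB);

	if (!va)
		return NULL;
	/* ... parse the table ... */
	return va;		/* release later with memunmap(va) */
}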
- */ - if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } - - kstat_incr_irqs_this_cpu(desc); - - do { - if (unlikely(!desc->action)) - goto out_eoi; - - handle_irq_event(desc); - - } while ((desc->istate & IRQS_PENDING) && - !irqd_irq_disabled(&desc->irq_data)); - -out_eoi: - chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); -} -#endif - /** * handle_percpu_irq - Per CPU local irq handler * @desc: the interrupt description structure for this irq diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a979523640d0..b0290849c395 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -90,7 +90,6 @@ extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); extern void irq_shutdown_and_deactivate(struct irq_desc *desc); -extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); @@ -98,18 +97,12 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); -extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask); - #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } #else extern void irq_mark_irq(unsigned int irq); #endif -extern int __irq_get_irqchip_state(struct irq_data *data, - enum irqchip_irq_state which, - bool *state); - irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -139,8 +132,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern void irq_set_thread_affinity(struct irq_desc *desc); - extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); @@ -442,6 +433,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) return desc->pending_mask; } bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); +void irq_force_complete_move(struct irq_desc *desc); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -467,6 +459,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } +static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 287830739783..4258cd6bd3b4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -991,7 +991,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) return desc && desc->kstat_irqs ? 
per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } -unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) +static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ec6d8e72d980..2861f89880af 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1589,9 +1589,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, } } -int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, + unsigned int nr_irqs, void *arg) { if (!domain->ops->alloc) { pr_debug("domain->ops->alloc() is NULL\n"); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f300bb6be3bd..753eef8e041c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -35,6 +35,8 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif +static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); + static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); @@ -187,7 +189,7 @@ bool irq_can_set_affinity_usr(unsigned int irq) * set_cpus_allowed_ptr() here as we hold desc->lock and this * code can be called from hard interrupt context. */ -void irq_set_thread_affinity(struct irq_desc *desc) +static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; @@ -2789,8 +2791,7 @@ out: irq_put_desc_unlock(desc, flags); } -int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, - bool *state) +static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index eb150afd671f..147cabb4c077 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) return true; } +void irq_force_complete_move(struct irq_desc *desc) +{ + for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) { + if (d->chip && d->chip->irq_force_complete_move) { + d->chip->irq_force_complete_move(d); + return; + } + } +} + void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); @@ -117,3 +127,13 @@ void __irq_move_irq(struct irq_data *idata) if (!masked) idata->chip->irq_unmask(idata); } + +bool irq_can_move_in_process_context(struct irq_data *data) +{ + /* + * Get the top level irq_data in the hierarchy, which is optimized + * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled. 
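Editor's note: the new irq_force_complete_move() helper walks the irq_data hierarchy from the top down via ->parent_data and invokes the first chip that provides the callback, so only the chip that actually owns vector cleanup needs to implement it. A hypothetical irqchip would opt in roughly like this (sketch, not a real driver; the other chip callbacks are assumed to exist as usual):

#include <linux/irq.h>

/* Hypothetical driver hook: discard pending cleanup for @d's old vector. */
static void my_chip_force_complete_move(struct irq_data *d)
{
	/* driver-specific cleanup here */
}

static struct irq_chip my_chip = {
	.name				= "my-chip",
	/* .irq_mask / .irq_unmask / ... as usual */
	.irq_force_complete_move	= my_chip_force_complete_move,
};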
+ */ + data = irq_desc_get_irq_data(irq_data_to_desc(data)); + return irq_can_move_pcntxt(data); +} diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 396a067a8a56..5c8d43cdb0a3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -15,6 +15,7 @@ #include <linux/mutex.h> #include <linux/pci.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include <linux/sysfs.h> #include <linux/types.h> #include <linux/xarray.h> @@ -756,12 +757,30 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw return info->ops->msi_translate(domain, fwspec, hwirq, type); } +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + struct msi_desc *desc = irq_data_get_msi_desc(irqd); + + if (!desc) + return; + + seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi); + seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo); + seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data); +} +#endif + static const struct irq_domain_ops msi_domain_ops = { .alloc = msi_domain_alloc, .free = msi_domain_free, .activate = msi_domain_activate, .deactivate = msi_domain_deactivate, .translate = msi_domain_translate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = msi_domain_debug_show, +#endif }; static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, @@ -1143,7 +1162,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; - if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + if (info->flags & MSI_FLAG_NO_MASK) return false; /* diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 93a822d3c468..7cb19e601426 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -653,13 +653,12 @@ static int __jump_label_mod_text_reserved(void *start, void *end) struct module *mod; int ret; - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - + scoped_guard(rcu) { + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + } if (!mod) return 0; @@ -746,9 +745,9 @@ static int jump_label_add_module(struct module *mod) kfree(jlm); return -ENOMEM; } - preempt_disable(); - jlm2->mod = __module_address((unsigned long)key); - preempt_enable(); + scoped_guard(rcu) + jlm2->mod = __module_address((unsigned long)key); + jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); @@ -906,13 +905,13 @@ static void jump_label_update(struct static_key *key) return; } - preempt_disable(); - mod = __module_address((unsigned long)key); - if (mod) { - stop = mod->jump_entries + mod->num_jump_entries; - init = mod->state == MODULE_STATE_COMING; + scoped_guard(rcu) { + mod = __module_address((unsigned long)key); + if (mod) { + stop = mod->jump_entries + mod->num_jump_entries; + init = mod->state == MODULE_STATE_COMING; + } } - preempt_enable(); #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a9a0ca605d4a..4198f30aac3c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -148,16 +148,8 @@ static unsigned int get_symbol_offset(unsigned long pos) unsigned long kallsyms_sym_address(int 
idx) { - /* values are unsigned offsets if --absolute-percpu is not in effect */ - if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) - return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; - - /* ...otherwise, positive offsets are absolute values */ - if (kallsyms_offsets[idx] >= 0) - return kallsyms_offsets[idx]; - - /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ - return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; + /* values are unsigned offsets */ + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; } static unsigned int get_symbol_seq(int index) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 2c596851f8a9..7c1a65bd5f8d 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -145,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, */ task1 = find_task_by_vpid(pid1); task2 = find_task_by_vpid(pid2); - if (!task1 || !task2) + if (unlikely(!task1 || !task2)) goto err_no_task; get_task_struct(task1); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c0bdc1686154..c22ad51c4317 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1013,7 +1013,7 @@ int kernel_kexec(void) error = -EBUSY; goto Restore_console; } - suspend_console(); + console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; @@ -1072,7 +1072,7 @@ int kernel_kexec(void) Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: - resume_console(); + console_resume_all(); thaw_processes(); Restore_console: pm_restore_console(); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 88aeac84e4c0..ffe0c3d52306 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1547,7 +1547,7 @@ static int check_kprobe_address_safe(struct kprobe *p, /* Ensure the address is in a text area, and find a module if exists. */ *probed_mod = NULL; if (!core_kernel_text((unsigned long) p->addr)) { - guard(preempt)(); + guard(rcu)(); *probed_mod = __module_text_address((unsigned long) p->addr); if (!(*probed_mod)) return -EINVAL; diff --git a/kernel/kthread.c b/kernel/kthread.c index 4005b13ebd7f..5dc5b0d7238e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -859,7 +859,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; unsigned long flags; - int ret; + int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); @@ -892,7 +892,7 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) out: free_cpumask_var(affinity); - return 0; + return ret; } /* diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0cd39954d5a1..0e73fac55f8e 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -59,7 +59,7 @@ static void klp_find_object_module(struct klp_object *obj) if (!klp_is_module(obj)) return; - rcu_read_lock_sched(); + guard(rcu)(); /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. The patches are removed by @@ -75,8 +75,6 @@ static void klp_find_object_module(struct klp_object *obj) */ if (mod && mod->klp_alive) obj->mod = mod; - - rcu_read_unlock_sched(); } static bool klp_initialized(void) @@ -601,9 +599,12 @@ static int klp_add_object_nops(struct klp_patch *patch, } /* - * Add 'nop' functions which simply return to the caller to run - * the original function. The 'nop' functions are added to a - * patch to facilitate a 'replace' mode. 
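Editor's note: several of these conversions (jump_label above, kprobes and livepatch here, the module code further down) replace open-coded preempt_disable()/rcu_read_lock() pairs with the scope-based guards from <linux/cleanup.h>, which exit the RCU read section automatically at the end of the scope or block, even on early return. A condensed sketch of the pattern, mirroring the jump_label conversion:

#include <linux/cleanup.h>	/* guard(), scoped_guard() */
#include <linux/module.h>
#include <linux/rcupdate.h>

static struct module *get_text_module(unsigned long addr)
{
	struct module *mod;

	/* The RCU read section covers exactly the braced block. */
	scoped_guard(rcu) {
		mod = __module_text_address(addr);
		if (mod && !try_module_get(mod))
			mod = NULL;
	}
	return mod;
}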
+ * Add 'nop' functions which simply return to the caller to run the + * original function. + * + * They are added only when the atomic replace mode is used and only for + * functions which are currently livepatched but are no longer included + * in the new livepatch. */ static int klp_add_nops(struct klp_patch *patch) { diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0db4093d17b8..a114949eeed5 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,7 +5,8 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. +KASAN_SANITIZE_lockdep.o := n KCSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 97fb6f3f840a..4e36258cc34f 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -50,6 +50,11 @@ LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ #endif /* CONFIG_QUEUED_SPINLOCKS */ /* + * Locking events for Resilient Queued Spin Lock + */ +LOCK_EVENT(rqspinlock_lock_timeout) /* # of locking ops that timeout */ + +/* * Locking events for rwsem */ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ @@ -67,3 +72,31 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ + +/* + * Locking events for rtlock_slowlock() + */ +LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */ +LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */ +LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */ + +/* + * Locking events for rt_mutex_slowlock() + */ +LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */ +LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */ +LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */ +LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */ +LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */ +LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */ + +/* + * Locking events for lockdep + */ +LOCK_EVENT(lockdep_acquire) +LOCK_EVENT(lockdep_lock) +LOCK_EVENT(lockdep_nocheck) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4470680f0226..58d78a33ac65 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,12 @@ #include <linux/lockdep.h> #include <linux/context_tracking.h> #include <linux/console.h> +#include <linux/kasan.h> #include <asm/sections.h> #include "lockdep_internals.h" +#include "lock_events.h" #include <trace/events/lock.h> @@ -170,6 +172,7 @@ static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { lockdep_lock(); + lockevent_inc(lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -5091,8 +5094,12 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(lock->key == &__lockdep_no_track__)) return 0; - if (!prove_locking || 
lock->key == &__lockdep_no_validate__) + lockevent_inc(lockdep_acquire); + + if (!prove_locking || lock->key == &__lockdep_no_validate__) { check = 0; + lockevent_inc(lockdep_nocheck); + } if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -5824,6 +5831,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!debug_locks) return; + /* + * As KASAN instrumentation is disabled and lock_acquire() is usually + * the first lockdep call when a task tries to acquire a lock, add + * kasan_check_byte() here to check for use-after-free and other + * memory errors. + */ + kasan_check_byte(lock); + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { @@ -6249,6 +6264,9 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) hlist_del_rcu(&class->hash_entry); WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); + /* Class allocated but not used, -1 in nr_unused_locks */ + if (class->usage_mask == 0) + debug_atomic_dec(nr_unused_locks); nr_lock_classes--; __clear_bit(class - lock_classes, lock_classes_in_use); if (class - lock_classes == max_lock_class_idx) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index cc33470f4de9..ce0362f0a871 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -362,6 +362,60 @@ static struct lock_torture_ops raw_spin_lock_irq_ops = { .name = "raw_spin_lock_irq" }; +#ifdef CONFIG_BPF_SYSCALL + +#include <asm/rqspinlock.h> +static rqspinlock_t rqspinlock; + +static int torture_raw_res_spin_write_lock(int tid __maybe_unused) +{ + raw_res_spin_lock(&rqspinlock); + return 0; +} + +static void torture_raw_res_spin_write_unlock(int tid __maybe_unused) +{ + raw_res_spin_unlock(&rqspinlock); +} + +static struct lock_torture_ops raw_res_spin_lock_ops = { + .writelock = torture_raw_res_spin_write_lock, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock" +}; + +static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused) +{ + unsigned long flags; + + raw_res_spin_lock_irqsave(&rqspinlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused) +{ + raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops raw_res_spin_lock_irq_ops = { + .writelock = torture_raw_res_spin_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_rt_boost, + .writeunlock = torture_raw_res_spin_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "raw_res_spin_lock_irq" +}; + +#endif + static DEFINE_RWLOCK(torture_rwlock); static int torture_rwlock_write_lock(int tid __maybe_unused) @@ -1168,6 +1222,9 @@ static int __init lock_torture_init(void) &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, &raw_spin_lock_ops, &raw_spin_lock_irq_ops, +#ifdef CONFIG_BPF_SYSCALL + &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops, +#endif &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, &ww_mutex_lock_ops, diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 85251d8771d9..5c92ba199b90 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -15,12 +15,6 @@ #include <asm/mcs_spinlock.h> -struct mcs_spinlock { - struct 
mcs_spinlock *next; - int locked; /* 1 if lock acquired */ - int count; /* nesting count, see qspinlock.c */ -}; - #ifndef arch_mcs_spin_lock_contended /* * Using smp_cond_load_acquire() provides the acquire semantics @@ -30,9 +24,7 @@ struct mcs_spinlock { * spinning, and smp_cond_load_acquire() provides that behavior. */ #define arch_mcs_spin_lock_contended(l) \ -do { \ - smp_cond_load_acquire(l, VAL); \ -} while (0) + smp_cond_load_acquire(l, VAL) #endif #ifndef arch_mcs_spin_unlock_contended diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b36f23de48f1..19b636f60a24 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -143,6 +143,8 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; + MUTEX_WARN_ON(lock->magic != lock); + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 7d96bed718e4..af8d122bb649 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -25,8 +25,9 @@ #include <trace/events/lock.h> /* - * Include queued spinlock statistics code + * Include queued spinlock definitions and statistics code */ +#include "qspinlock.h" #include "qspinlock_stat.h" /* @@ -67,36 +68,6 @@ */ #include "mcs_spinlock.h" -#define MAX_NODES 4 - -/* - * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in - * size and four of them will fit nicely in one 64-byte cacheline. For - * pvqspinlock, however, we need more space for extra data. To accommodate - * that, we insert two more long words to pad it up to 32 bytes. IOW, only - * two of them can fit in a cacheline in this case. That is OK as it is rare - * to have more than 2 levels of slowpath nesting in actual use. We don't - * want to penalize pvqspinlocks to optimize for a rare case in native - * qspinlocks. - */ -struct qnode { - struct mcs_spinlock mcs; -#ifdef CONFIG_PARAVIRT_SPINLOCKS - long reserved[2]; -#endif -}; - -/* - * The pending bit spinning loop count. - * This heuristic is used to limit the number of lockword accesses - * made by atomic_cond_read_relaxed when waiting for the lock to - * transition out of the "== _Q_PENDING_VAL" state. We don't spin - * indefinitely because there's no guarantee that we'll make forward - * progress. - */ -#ifndef _Q_PENDING_LOOPS -#define _Q_PENDING_LOOPS 1 -#endif /* * Per-CPU queue node structures; we can never have more than 4 nested @@ -106,161 +77,7 @@ struct qnode { * * PV doubles the storage and uses the second cacheline for PV state. */ -static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]); - -/* - * We must be able to distinguish between no-tail and the tail at 0:0, - * therefore increment the cpu number by one. - */ - -static inline __pure u32 encode_tail(int cpu, int idx) -{ - u32 tail; - - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ - - return tail; -} - -static inline __pure struct mcs_spinlock *decode_tail(u32 tail) -{ - int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; - int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - - return per_cpu_ptr(&qnodes[idx].mcs, cpu); -} - -static inline __pure -struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) -{ - return &((struct qnode *)base + idx)->mcs; -} - -#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) - -#if _Q_PENDING_BITS == 8 -/** - * clear_pending - clear the pending bit. 
- * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - WRITE_ONCE(lock->pending, 0); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - * - * Lock stealing is not allowed if this function is used. - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); -} - -/* - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail), which heads an address dependency - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - /* - * We can use relaxed semantics since the caller ensures that the - * MCS node is properly initialized before updating the tail. - */ - return (u32)xchg_relaxed(&lock->tail, - tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; -} - -#else /* _Q_PENDING_BITS == 8 */ - -/** - * clear_pending - clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(struct qspinlock *lock) -{ - atomic_andnot(_Q_PENDING_VAL, &lock->val); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - */ -static __always_inline void clear_pending_set_locked(struct qspinlock *lock) -{ - atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); -} - -/** - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = (old & _Q_LOCKED_PENDING_MASK) | tail; - /* - * We can use relaxed semantics since the caller ensures that - * the MCS node is properly initialized before updating the - * tail. - */ - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - - return old; -} -#endif /* _Q_PENDING_BITS == 8 */ - -/** - * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending - * @lock : Pointer to queued spinlock structure - * Return: The previous lock value - * - * *,*,* -> *,1,* - */ -#ifndef queued_fetch_set_pending_acquire -static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) -{ - return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); -} -#endif - -/** - * set_locked - Set the lock bit and own the lock - * @lock: Pointer to queued spinlock structure - * - * *,*,0 -> *,0,1 - */ -static __always_inline void set_locked(struct qspinlock *lock) -{ - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); -} - +static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]); /* * Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for @@ -410,7 +227,7 @@ pv_queue: * any MCS node. This is not the most elegant solution, but is * simple enough. 
*/ - if (unlikely(idx >= MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES)) { lockevent_inc(lock_no_node); while (!queued_spin_trylock(lock)) cpu_relax(); @@ -465,7 +282,7 @@ pv_queue: * head of the waitqueue. */ if (old & _Q_TAIL_MASK) { - prev = decode_tail(old); + prev = decode_tail(old, qnodes); /* Link @node into the waitqueue. */ WRITE_ONCE(prev->next, node); diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h new file mode 100644 index 000000000000..d69958a844f7 --- /dev/null +++ b/kernel/locking/qspinlock.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Queued spinlock defines + * + * This file contains macro definitions and functions shared between different + * qspinlock slow path implementations. + */ +#ifndef __LINUX_QSPINLOCK_H +#define __LINUX_QSPINLOCK_H + +#include <asm-generic/percpu.h> +#include <linux/percpu-defs.h> +#include <asm-generic/qspinlock.h> +#include <asm-generic/mcs_spinlock.h> + +#define _Q_MAX_NODES 4 + +/* + * The pending bit spinning loop count. + * This heuristic is used to limit the number of lockword accesses + * made by atomic_cond_read_relaxed when waiting for the lock to + * transition out of the "== _Q_PENDING_VAL" state. We don't spin + * indefinitely because there's no guarantee that we'll make forward + * progress. + */ +#ifndef _Q_PENDING_LOOPS +#define _Q_PENDING_LOOPS 1 +#endif + +/* + * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in + * size and four of them will fit nicely in one 64-byte cacheline. For + * pvqspinlock, however, we need more space for extra data. To accommodate + * that, we insert two more long words to pad it up to 32 bytes. IOW, only + * two of them can fit in a cacheline in this case. That is OK as it is rare + * to have more than 2 levels of slowpath nesting in actual use. We don't + * want to penalize pvqspinlocks to optimize for a rare case in native + * qspinlocks. + */ +struct qnode { + struct mcs_spinlock mcs; +#ifdef CONFIG_PARAVIRT_SPINLOCKS + long reserved[2]; +#endif +}; + +/* + * We must be able to distinguish between no-tail and the tail at 0:0, + * therefore increment the cpu number by one. + */ + +static inline __pure u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline __pure struct mcs_spinlock *decode_tail(u32 tail, + struct qnode __percpu *qnodes) +{ + int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return per_cpu_ptr(&qnodes[idx].mcs, cpu); +} + +static inline __pure +struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx) +{ + return &((struct qnode *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +#if _Q_PENDING_BITS == 8 +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. 
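Editor's note: encode_tail()/decode_tail(), now shared via qspinlock.h, pack the CPU number (offset by one so that a tail of zero means "no waiter") and the per-CPU nesting index into one word. A standalone round-trip with illustrative shift/mask values (the real ones come from the qspinlock type headers):

#include <stdint.h>
#include <stdio.h>

/* Illustrative layout: 2 index bits at bit 16, CPU number above them. */
#define TAIL_IDX_OFFSET	16
#define TAIL_IDX_MASK	(0x3U << TAIL_IDX_OFFSET)
#define TAIL_CPU_OFFSET	18

static uint32_t encode_tail(int cpu, int idx)
{
	return ((uint32_t)(cpu + 1) << TAIL_CPU_OFFSET) |
	       ((uint32_t)idx << TAIL_IDX_OFFSET);
}

int main(void)
{
	uint32_t tail = encode_tail(5, 2);	/* CPU 5, third nesting level */
	int cpu = (int)(tail >> TAIL_CPU_OFFSET) - 1;
	int idx = (int)((tail & TAIL_IDX_MASK) >> TAIL_IDX_OFFSET);

	printf("cpu=%d idx=%d; tail=0 still means no waiter\n", cpu, idx);
	return 0;
}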
+ */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/* + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail), which heads an address dependency + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + /* + * We can use relaxed semantics since the caller ensures that the + * MCS node is properly initialized before updating the tail. + */ + return (u32)xchg_relaxed(&lock->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_andnot(_Q_PENDING_VAL, &lock->val); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + */ +static __always_inline void clear_pending_set_locked(struct qspinlock *lock) +{ + atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val); +} + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. 
+ */ + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/** + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending + * @lock : Pointer to queued spinlock structure + * Return: The previous lock value + * + * *,*,* -> *,1,* + */ +#ifndef queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val); +} +#endif + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +#endif /* __LINUX_QSPINLOCK_H */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4a8df1800cbb..c80902eacd79 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -27,6 +27,7 @@ #include <trace/events/lock.h> #include "rtmutex_common.h" +#include "lock_events.h" #ifndef WW_RT # define build_ww_mutex() (false) @@ -1612,10 +1613,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) + if (try_to_take_rt_mutex(lock, current, waiter)) { + lockevent_inc(rtmutex_slow_acq3); break; + } if (timeout && !timeout->task) { ret = -ETIMEDOUT; @@ -1638,8 +1642,10 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) { + lockevent_inc(rtmutex_slow_sleep); rt_mutex_schedule(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1694,6 +1700,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, int ret; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtmutex_slowlock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -1701,6 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq1); return 0; } @@ -1719,10 +1727,12 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq2); } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); + lockevent_inc(rtmutex_deadlock); } /* @@ -1751,6 +1761,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, &waiter, wake_q); debug_rt_mutex_free_waiter(&waiter); + lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q)); return ret; } @@ -1823,9 +1834,12 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, struct task_struct *owner; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtlock_slowlock); - if (try_to_take_rt_mutex(lock, current, NULL)) + if (try_to_take_rt_mutex(lock, current, NULL)) { + lockevent_inc(rtlock_slow_acq1); return; + } rt_mutex_init_rtlock_waiter(&waiter); @@ -1838,8 +1852,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, for (;;) { /* Try to acquire the lock 
again */ - if (try_to_take_rt_mutex(lock, current, &waiter)) + if (try_to_take_rt_mutex(lock, current, &waiter)) { + lockevent_inc(rtlock_slow_acq2); break; + } if (&waiter == rt_mutex_top_waiter(lock)) owner = rt_mutex_owner(lock); @@ -1847,8 +1863,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, owner = NULL; raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) { + lockevent_inc(rtlock_slow_sleep); schedule_rtlock(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_RTLOCK_WAIT); @@ -1865,6 +1883,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, debug_rt_mutex_free_waiter(&waiter); trace_contention_end(lock, 0); + lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q)); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index c38a2d2d4a7e..78dd3d8c6554 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -59,8 +59,8 @@ struct rt_mutex_waiter { }; /** - * rt_wake_q_head - Wrapper around regular wake_q_head to support - * "sleeping" spinlocks on RT + * struct rt_wake_q_head - Wrapper around regular wake_q_head to support + * "sleeping" spinlocks on RT * @head: The regular wake_q_head for sleeping lock variants * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups */ diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 34bfae72f295..de9117c0e671 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -29,6 +29,7 @@ #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> +#include <linux/sched/wake_q.h> #include <linux/semaphore.h> #include <linux/spinlock.h> #include <linux/ftrace.h> @@ -38,7 +39,7 @@ static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); -static noinline void __up(struct semaphore *sem); +static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); /** * down - acquire the semaphore @@ -183,13 +184,16 @@ EXPORT_SYMBOL(down_timeout); void __sched up(struct semaphore *sem) { unsigned long flags; + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else - __up(sem); + __up(sem, &wake_q); raw_spin_unlock_irqrestore(&sem->lock, flags); + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); } EXPORT_SYMBOL(up); @@ -269,11 +273,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); } -static noinline void __sched __up(struct semaphore *sem) +static noinline void __sched __up(struct semaphore *sem, + struct wake_q_head *wake_q) { struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); list_del(&waiter->list); waiter->up = true; - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); } diff --git a/kernel/module/internal.h b/kernel/module/internal.h index d09b46ef032f..626cf8668a7e 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -124,17 +124,6 @@ char *module_next_tag_pair(char *string, unsigned long *secsize); #define 
for_each_modinfo_entry(entry, info, name) \ for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry)) -static inline void module_assert_mutex_or_preempt(void) -{ -#ifdef CONFIG_LOCKDEP - if (unlikely(!debug_locks)) - return; - - WARN_ON_ONCE(!rcu_read_lock_sched_held() && - !lockdep_is_held(&module_mutex)); -#endif -} - static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym) { #ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index bf65e0c3c86f..00a60796327c 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -177,19 +177,15 @@ void add_kallsyms(struct module *mod, const struct load_info *info) unsigned long strtab_size; void *data_base = mod->mem[MOD_DATA].base; void *init_data_base = mod->mem[MOD_INIT_DATA].base; + struct mod_kallsyms *kallsyms; - /* Set up to point into init section. */ - mod->kallsyms = (void __rcu *)init_data_base + - info->mod_kallsyms_init_off; + kallsyms = init_data_base + info->mod_kallsyms_init_off; - rcu_read_lock(); - /* The following is safe since this pointer cannot change */ - rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr; - rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + kallsyms->symtab = (void *)symsec->sh_addr; + kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - rcu_dereference(mod->kallsyms)->strtab = - (void *)info->sechdrs[info->index.str].sh_addr; - rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs; + kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + kallsyms->typetab = init_data_base + info->init_typeoffs; /* * Now populate the cut down core kallsyms for after init @@ -199,20 +195,19 @@ void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_kallsyms.strtab = s = data_base + info->stroffs; mod->core_kallsyms.typetab = data_base + info->core_typeoffs; strtab_size = info->core_typeoffs - info->stroffs; - src = rcu_dereference(mod->kallsyms)->symtab; - for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) { - rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info); + src = kallsyms->symtab; + for (ndst = i = 0; i < kallsyms->num_symtab; i++) { + kallsyms->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { ssize_t ret; mod->core_kallsyms.typetab[ndst] = - rcu_dereference(mod->kallsyms)->typetab[i]; + kallsyms->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; - ret = strscpy(s, - &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name], + ret = strscpy(s, &kallsyms->strtab[src[i].st_name], strtab_size); if (ret < 0) break; @@ -220,7 +215,9 @@ void add_kallsyms(struct module *mod, const struct load_info *info) strtab_size -= ret + 1; } } - rcu_read_unlock(); + + /* Set up to point into init section. 
*/ + rcu_assign_pointer(mod->kallsyms, kallsyms); mod->core_kallsyms.num_symtab = ndst; } @@ -260,7 +257,7 @@ static const char *find_kallsyms_symbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval, bestval; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms); struct module_memory *mod_mem; /* At worse, next value is at end of module */ @@ -319,7 +316,7 @@ void * __weak dereference_module_function_descriptor(struct module *mod, /* * For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. + * not to lock to avoid deadlock on oopses, RCU is enough. */ int module_address_lookup(unsigned long addr, unsigned long *size, @@ -332,7 +329,7 @@ int module_address_lookup(unsigned long addr, int ret = 0; struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address(addr); if (mod) { if (modname) @@ -350,8 +347,6 @@ int module_address_lookup(unsigned long addr, if (sym) ret = strscpy(namebuf, sym, KSYM_NAME_LEN); } - preempt_enable(); - return ret; } @@ -359,7 +354,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) { struct module *mod; - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -371,12 +366,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) goto out; strscpy(symname, sym, KSYM_NAME_LEN); - preempt_enable(); return 0; } } out: - preempt_enable(); return -ERANGE; } @@ -385,13 +378,13 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, { struct module *mod; - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { struct mod_kallsyms *kallsyms; if (mod->state == MODULE_STATE_UNFORMED) continue; - kallsyms = rcu_dereference_sched(mod->kallsyms); + kallsyms = rcu_dereference(mod->kallsyms); if (symnum < kallsyms->num_symtab) { const Elf_Sym *sym = &kallsyms->symtab[symnum]; @@ -400,12 +393,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strscpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); - preempt_enable(); return 0; } symnum -= kallsyms->num_symtab; } - preempt_enable(); return -ERANGE; } @@ -413,7 +404,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name) { unsigned int i; - struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); + struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms); for (i = 0; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; @@ -453,23 +444,15 @@ static unsigned long __module_kallsyms_lookup_name(const char *name) /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name) { - unsigned long ret; - /* Don't lock: we're in enough trouble already. 
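Editor's note: add_kallsyms() now builds the whole mod_kallsyms structure through a private pointer and publishes it with rcu_assign_pointer() only once it is complete; the lookup paths pair that with rcu_dereference() under RCU instead of rcu_dereference_sched() under disabled preemption. A condensed publish/read sketch with simplified, illustrative types:

#include <linux/cleanup.h>
#include <linux/rcupdate.h>

struct ksyms {
	unsigned int num_symtab;
	/* ... symtab, strtab, typetab ... */
};

static struct ksyms __rcu *cur_ksyms;

/* Writer: every field of @k is initialized before this call. */
static void publish_ksyms(struct ksyms *k)
{
	rcu_assign_pointer(cur_ksyms, k);	/* orders the init before visibility */
}

/* Reader: runs inside an RCU read-side section. */
static unsigned int num_syms(void)
{
	struct ksyms *k;

	guard(rcu)();
	k = rcu_dereference(cur_ksyms);
	return k ? k->num_symtab : 0;
}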
*/ - preempt_disable(); - ret = __module_kallsyms_lookup_name(name); - preempt_enable(); - return ret; + guard(rcu)(); + return __module_kallsyms_lookup_name(name); } unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name) { - unsigned long ret; - - preempt_disable(); - ret = __find_kallsyms_symbol_value(mod, name); - preempt_enable(); - return ret; + guard(rcu)(); + return __find_kallsyms_symbol_value(mod, name); } int module_kallsyms_on_each_symbol(const char *modname, @@ -490,10 +473,8 @@ int module_kallsyms_on_each_symbol(const char *modname, if (modname && strcmp(modname, mod->name)) continue; - /* Use rcu_dereference_sched() to remain compliant with the sparse tool */ - preempt_disable(); - kallsyms = rcu_dereference_sched(mod->kallsyms); - preempt_enable(); + kallsyms = rcu_dereference_check(mod->kallsyms, + lockdep_is_held(&module_mutex)); for (i = 0; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; diff --git a/kernel/module/main.c b/kernel/module/main.c index 1fb9ad289a6f..debeb0eb15d7 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -67,7 +67,7 @@ /* * Mutex protects: - * 1) List of modules (also safely readable with preempt_disable), + * 1) List of modules (also safely readable within RCU read section), * 2) module_use links, * 3) mod_tree.addr_min/mod_tree.addr_max. * (delete and add uses RCU list operations). @@ -331,7 +331,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, /* * Find an exported symbol and return it, along with, (optional) crc and - * (optional) module which owns it. Needs preempt disabled or module_mutex. + * (optional) module which owns it. Needs RCU or module_mutex. */ bool find_symbol(struct find_symbol_arg *fsa) { @@ -345,8 +345,6 @@ bool find_symbol(struct find_symbol_arg *fsa) struct module *mod; unsigned int i; - module_assert_mutex_or_preempt(); - for (i = 0; i < ARRAY_SIZE(arr); i++) if (find_exported_symbol_in_section(&arr[i], NULL, fsa)) return true; @@ -374,16 +372,14 @@ bool find_symbol(struct find_symbol_arg *fsa) } /* - * Search for module by name: must hold module_mutex (or preempt disabled - * for read-only access). + * Search for module by name: must hold module_mutex (or RCU for read-only + * access). 
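Editor's note: some of these paths are legitimately called with module_mutex held instead of (or as well as) RCU, which is why module_kallsyms_on_each_symbol() above uses rcu_dereference_check() with a lockdep expression: either form of protection satisfies the check. In isolation the idiom looks like this (sketch of the same call; module_mutex is the module core's internal mutex):

/* Safe under RCU *or* module_mutex; lockdep is told about both. */
static struct mod_kallsyms *mod_kallsyms_deref(struct module *mod)
{
	return rcu_dereference_check(mod->kallsyms,
				     lockdep_is_held(&module_mutex));
}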
*/ struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; - module_assert_mutex_or_preempt(); - list_for_each_entry_rcu(mod, &modules, list, lockdep_is_held(&module_mutex)) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) @@ -454,8 +450,7 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) struct module *mod; unsigned int cpu; - preempt_disable(); - + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -472,13 +467,10 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) per_cpu_ptr(mod->percpu, get_boot_cpu_id()); } - preempt_enable(); return true; } } } - - preempt_enable(); return false; } @@ -795,8 +787,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, async_synchronize_full(); /* Store the name and taints of the last unloaded module for diagnostic purposes */ - strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name)); - strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints)); + strscpy(last_unloaded_module.name, mod->name); + strscpy(last_unloaded_module.taints, module_flags(mod, buf, false)); free_module(mod); /* someone could wait for the module in add_unformed_module() */ @@ -814,10 +806,9 @@ void __symbol_put(const char *symbol) .gplok = true, }; - preempt_disable(); + guard(rcu)(); BUG_ON(!find_symbol(&fsa)); module_put(fsa.owner); - preempt_enable(); } EXPORT_SYMBOL(__symbol_put); @@ -832,13 +823,12 @@ void symbol_put_addr(void *addr) /* * Even though we hold a reference on the module; we still need to - * disable preemption in order to safely traverse the data structure. + * RCU read section in order to safely traverse the data structure. */ - preempt_disable(); + guard(rcu)(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); - preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); @@ -1189,7 +1179,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod, getname: /* We must make copy under the lock if we failed to get ref. 
*/ - strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); + strscpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN); unlock: mutex_unlock(&module_mutex); return fsa.sym; @@ -1221,18 +1211,6 @@ void __weak module_arch_freeing_init(struct module *mod) { } -void *__module_writable_address(struct module *mod, void *loc) -{ - for_class_mod_mem_type(type, text) { - struct module_memory *mem = &mod->mem[type]; - - if (loc >= mem->base && loc < mem->base + mem->size) - return loc + (mem->rw_copy - mem->base); - } - - return loc; -} - static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { unsigned int size = PAGE_ALIGN(mod->mem[type].size); @@ -1250,21 +1228,15 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) if (!ptr) return -ENOMEM; - mod->mem[type].base = ptr; - if (execmem_is_rox(execmem_type)) { - ptr = vzalloc(size); + int err = execmem_make_temp_rw(ptr, size); - if (!ptr) { - execmem_free(mod->mem[type].base); + if (err) { + execmem_free(ptr); return -ENOMEM; } - mod->mem[type].rw_copy = ptr; mod->mem[type].is_rox = true; - } else { - mod->mem[type].rw_copy = mod->mem[type].base; - memset(mod->mem[type].base, 0, size); } /* @@ -1278,18 +1250,29 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) * *do* eventually get freed, but let's just keep things simple * and avoid *any* false positives. */ - kmemleak_not_leak(ptr); + if (!mod->mem[type].is_rox) + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; return 0; } +static void module_memory_restore_rox(struct module *mod) +{ + for_class_mod_mem_type(type, text) { + struct module_memory *mem = &mod->mem[type]; + + if (mem->is_rox) + execmem_restore_rox(mem->base, mem->size); + } +} + static void module_memory_free(struct module *mod, enum mod_mem_type type) { struct module_memory *mem = &mod->mem[type]; - if (mem->is_rox) - vfree(mem->rw_copy); - execmem_free(mem->base); } @@ -1348,7 +1331,7 @@ static void free_module(struct module *mod) mod_tree_remove(mod); /* Remove this module from bug list, this uses list_del_rcu */ module_bug_cleanup(mod); - /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */ + /* Wait for RCU synchronizing before releasing mod->list and buglist. 
*/ synchronize_rcu(); if (try_add_tainted_module(mod)) pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n", @@ -1371,21 +1354,18 @@ void *__symbol_get(const char *symbol) .warn = true, }; - preempt_disable(); - if (!find_symbol(&fsa)) - goto fail; - if (fsa.license != GPL_ONLY) { - pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n", - symbol); - goto fail; + scoped_guard(rcu) { + if (!find_symbol(&fsa)) + return NULL; + if (fsa.license != GPL_ONLY) { + pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n", + symbol); + return NULL; + } + if (strong_try_module_get(fsa.owner)) + return NULL; } - if (strong_try_module_get(fsa.owner)) - goto fail; - preempt_enable(); return (void *)kernel_symbol_value(fsa.sym); -fail: - preempt_enable(); - return NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -2642,7 +2622,6 @@ static int move_module(struct module *mod, struct load_info *info) for_each_mod_mem_type(type) { if (!mod->mem[type].size) { mod->mem[type].base = NULL; - mod->mem[type].rw_copy = NULL; continue; } @@ -2659,7 +2638,6 @@ static int move_module(struct module *mod, struct load_info *info) void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; const char *sname; - unsigned long addr; if (!(shdr->sh_flags & SHF_ALLOC)) continue; @@ -2680,14 +2658,12 @@ static int move_module(struct module *mod, struct load_info *info) ret = PTR_ERR(dest); goto out_err; } - addr = (unsigned long)dest; codetag_section_found = true; } else { enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; - addr = (unsigned long)mod->mem[type].base + offset; - dest = mod->mem[type].rw_copy + offset; + dest = mod->mem[type].base + offset; } if (shdr->sh_type != SHT_NOBITS) { @@ -2710,13 +2686,14 @@ static int move_module(struct module *mod, struct load_info *info) * users of info can keep taking advantage and using the newly * minted official memory area. */ - shdr->sh_addr = addr; + shdr->sh_addr = (unsigned long)dest; pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr, (long)shdr->sh_size, info->secstrings + shdr->sh_name); } return 0; out_err: + module_memory_restore_rox(mod); for (t--; t >= 0; t--) module_memory_free(mod, t); if (codetag_section_found) @@ -2863,17 +2840,8 @@ int __weak module_finalize(const Elf_Ehdr *hdr, return 0; } -int __weak module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - return 0; -} - static int post_relocation(struct module *mod, const struct load_info *info) { - int ret; - /* Sort exception table now relocations are done. */ sort_extable(mod->extable, mod->extable + mod->num_exentries); @@ -2885,24 +2853,7 @@ static int post_relocation(struct module *mod, const struct load_info *info) add_kallsyms(mod, info); /* Arch-specific module finalizing. */ - ret = module_finalize(info->hdr, info->sechdrs, mod); - if (ret) - return ret; - - for_each_mod_mem_type(type) { - struct module_memory *mem = &mod->mem[type]; - - if (mem->is_rox) { - if (!execmem_update_copy(mem->base, mem->rw_copy, - mem->size)) - return -ENOMEM; - - vfree(mem->rw_copy); - mem->rw_copy = NULL; - } - } - - return module_post_finalize(info->hdr, info->sechdrs, mod); + return module_finalize(info->hdr, info->sechdrs, mod); } /* Call module constructors. */ @@ -3049,7 +3000,7 @@ static noinline int do_init_module(struct module *mod) #endif /* * We want to free module_init, but be aware that kallsyms may be - * walking this with preempt disabled. 
In all the failure paths, we + * walking this within an RCU read section. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success * path. execmem_free() cannot be called in an interrupt, so do the * work and call synchronize_rcu() in a work queue. @@ -3499,6 +3450,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->mem[type].size); } + module_memory_restore_rox(mod); module_deallocate(mod, info); free_copy: /* @@ -3715,28 +3667,23 @@ out: /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { - const struct exception_table_entry *e = NULL; struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address(addr); if (!mod) - goto out; + return NULL; if (!mod->num_exentries) - goto out; - - e = search_extable(mod->extable, - mod->num_exentries, - addr); -out: - preempt_enable(); - + return NULL; /* - * Now, if we found one, we are running inside it now, hence - * we cannot unload the module, hence no refcnt needed. + * The address passed here belongs to a module that is currently + * invoked (we are running inside it). Therefore its module::refcnt + * needs already be >0 to ensure that it is not removed at this stage. + * All other user need to invoke this function within a RCU read + * section. */ - return e; + return search_extable(mod->extable, mod->num_exentries, addr); } /** @@ -3748,20 +3695,15 @@ out: */ bool is_module_address(unsigned long addr) { - bool ret; - - preempt_disable(); - ret = __module_address(addr) != NULL; - preempt_enable(); - - return ret; + guard(rcu)(); + return __module_address(addr) != NULL; } /** * __module_address() - get the module which contains an address. * @addr: the address. * - * Must be called with preempt disabled or module mutex held so that + * Must be called within RCU read section or module mutex held so that * module doesn't get freed during this. */ struct module *__module_address(unsigned long addr) @@ -3779,8 +3721,6 @@ struct module *__module_address(unsigned long addr) return NULL; lookup: - module_assert_mutex_or_preempt(); - mod = mod_find(addr, &mod_tree); if (mod) { BUG_ON(!within_module(addr, mod)); @@ -3800,20 +3740,15 @@ lookup: */ bool is_module_text_address(unsigned long addr) { - bool ret; - - preempt_disable(); - ret = __module_text_address(addr) != NULL; - preempt_enable(); - - return ret; + guard(rcu)(); + return __module_text_address(addr) != NULL; } /** * __module_text_address() - get the module whose code contains an address. * @addr: the address. * - * Must be called with preempt disabled or module mutex held so that + * Must be called within RCU read section or module mutex held so that * module doesn't get freed during this. 
*/ struct module *__module_text_address(unsigned long addr) @@ -3836,7 +3771,7 @@ void print_modules(void) printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ - preempt_disable(); + guard(rcu)(); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; @@ -3844,7 +3779,6 @@ void print_modules(void) } print_unloaded_tainted_modules(); - preempt_enable(); if (last_unloaded_module.name[0]) pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name, last_unloaded_module.taints); diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 74834ba15615..03f4142cfbf4 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> +#include <linux/execmem.h> #include "internal.h" static int module_set_memory(const struct module *mod, enum mod_mem_type type, @@ -32,12 +33,12 @@ static int module_set_memory(const struct module *mod, enum mod_mem_type type, int module_enable_text_rox(const struct module *mod) { for_class_mod_mem_type(type, text) { + const struct module_memory *mem = &mod->mem[type]; int ret; - if (mod->mem[type].is_rox) - continue; - - if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + if (mem->is_rox) + ret = execmem_restore_rox(mem->base, mem->size); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) ret = module_set_memory(mod, type, set_memory_rox); else ret = module_set_memory(mod, type, set_memory_x); diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c index 16742d1c630c..4fefec5b683c 100644 --- a/kernel/module/tracking.c +++ b/kernel/module/tracking.c @@ -21,8 +21,6 @@ int try_add_tainted_module(struct module *mod) { struct mod_unload_taint *mod_taint; - module_assert_mutex_or_preempt(); - if (!mod->taints) goto out; diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index 277197977d43..d3204c5c74eb 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -12,11 +12,11 @@ /* * Use a latched RB-tree for __module_address(); this allows us to use - * RCU-sched lookups of the address from any context. + * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING because those can really hit - * __module_address() hard by doing a lot of stack unwinding; potentially from - * NMI context. + * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * really hit __module_address() hard by doing a lot of stack unwinding; + * potentially from NMI context. */ static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) diff --git a/kernel/module/version.c b/kernel/module/version.c index 3718a8868321..2beefeba82d9 100644 --- a/kernel/module/version.c +++ b/kernel/module/version.c @@ -79,17 +79,17 @@ int check_modstruct_version(const struct load_info *info, .name = "module_layout", .gplok = true, }; + bool have_symbol; /* * Since this should be found in kernel (which can't be removed), no - * locking is necessary -- use preempt_disable() to placate lockdep. + * locking is necessary. Regardless use a RCU read section to keep + * lockdep happy. 
*/ - preempt_disable(); - if (!find_symbol(&fsa)) { - preempt_enable(); - BUG(); - } - preempt_enable(); + scoped_guard(rcu) + have_symbol = find_symbol(&fsa); + BUG_ON(!have_symbol); + return check_version(info, "module_layout", mod, fsa.crc); } diff --git a/kernel/padata.c b/kernel/padata.c index 418987056340..b3d4eacc4f5d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -290,7 +290,7 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, if (remove_object) { list_del_init(&padata->list); ++pd->processed; - pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false); + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); } spin_unlock(&reorder->lock); diff --git a/kernel/panic.c b/kernel/panic.c index d8635d5cecb2..0c55eec9e874 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -511,6 +511,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { TAINT_FLAG(AUX, 'X', ' ', true), TAINT_FLAG(RANDSTRUCT, 'T', ' ', true), TAINT_FLAG(TEST, 'N', ' ', true), + TAINT_FLAG(FWCTL, 'J', ' ', true), }; #undef TAINT_FLAG diff --git a/kernel/params.c b/kernel/params.c index 0074d29c9b80..2509f216c9f3 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -551,7 +551,7 @@ struct module_param_attrs { unsigned int num; struct attribute_group grp; - struct param_attribute attrs[]; + struct param_attribute attrs[] __counted_by(num); }; #ifdef CONFIG_SYSFS @@ -651,35 +651,32 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, } /* Enlarge allocations. */ - new_mp = krealloc(mk->mp, - sizeof(*mk->mp) + - sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), + new_mp = krealloc(mk->mp, struct_size(mk->mp, attrs, mk->mp->num + 1), GFP_KERNEL); if (!new_mp) return -ENOMEM; mk->mp = new_mp; + mk->mp->num++; /* Extra pointer for NULL terminator */ - new_attrs = krealloc(mk->mp->grp.attrs, - sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), - GFP_KERNEL); + new_attrs = krealloc_array(mk->mp->grp.attrs, mk->mp->num + 1, + sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL); if (!new_attrs) return -ENOMEM; mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ - memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); - sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); - mk->mp->attrs[mk->mp->num].param = kp; - mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; + memset(&mk->mp->attrs[mk->mp->num - 1], 0, sizeof(mk->mp->attrs[0])); + sysfs_attr_init(&mk->mp->attrs[mk->mp->num - 1].mattr.attr); + mk->mp->attrs[mk->mp->num - 1].param = kp; + mk->mp->attrs[mk->mp->num - 1].mattr.show = param_attr_show; /* Do not allow runtime DAC changes to make param writable. 
*/ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) - mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + mk->mp->attrs[mk->mp->num - 1].mattr.store = param_attr_store; else - mk->mp->attrs[mk->mp->num].mattr.store = NULL; - mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; - mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; - mk->mp->num++; + mk->mp->attrs[mk->mp->num - 1].mattr.store = NULL; + mk->mp->attrs[mk->mp->num - 1].mattr.attr.name = (char *)name; + mk->mp->attrs[mk->mp->num - 1].mattr.attr.mode = kp->perm; /* Fix up all the pointers, since krealloc can move us */ for (i = 0; i < mk->mp->num; i++) diff --git a/kernel/pid.c b/kernel/pid.c index 924084713be8..4ac2ce46817f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = { }; EXPORT_SYMBOL_GPL(init_pid_ns); -/* - * Note: disable interrupts while the pidmap_lock is held as an - * interrupt might come in and do read_lock(&tasklist_lock). - * - * If we don't disable interrupts there is a nasty deadlock between - * detach_pid()->free_pid() and another cpu that does - * spin_lock(&pidmap_lock) followed by an interrupt routine that does - * read_lock(&tasklist_lock); - * - * After we clean up the tasklist_lock and know there are no - * irq handlers that take it we can leave the interrupts enabled. - * For now it is easier to be safe than to prove it can't happen. - */ - static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); @@ -128,11 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { - /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; - unsigned long flags; - spin_lock_irqsave(&pidmap_lock, flags); + lockdep_assert_not_held(&tasklist_lock); + + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; @@ -155,11 +141,23 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } pidfs_remove_pid(pid); - spin_unlock_irqrestore(&pidmap_lock, flags); + spin_unlock(&pidmap_lock); call_rcu(&pid->rcu, delayed_put_pid); } +void free_pids(struct pid **pids) +{ + int tmp; + + /* + * This can batch pidmap_lock. 
+ */ + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) + if (pids[tmp]) + free_pid(pids[tmp]); +} + struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { @@ -211,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, @@ -238,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); if (nr < 0) { @@ -272,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, upid = pid->numbers + ns->level; idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; pidfs_add_pid(pid); @@ -281,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); return pid; out_unlock: - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); put_pid_ns(ns); out_free: - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); @@ -302,7 +300,7 @@ out_free: if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -310,9 +308,9 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) @@ -339,17 +337,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) */ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid *pid = *task_pid_ptr(task, type); + struct pid *pid; + + lockdep_assert_held_write(&tasklist_lock); + + pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } -static void __change_pid(struct task_struct *task, enum pid_type type, - struct pid *new) +static void __change_pid(struct pid **pids, struct task_struct *task, + enum pid_type type, struct pid *new) { - struct pid **pid_ptr = task_pid_ptr(task, type); - struct pid *pid; + struct pid **pid_ptr, *pid; int tmp; + lockdep_assert_held_write(&tasklist_lock); + + pid_ptr = task_pid_ptr(task, type); pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); @@ -364,18 +368,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type, if (pid_has_task(pid, tmp)) return; - free_pid(pid); + WARN_ON(pids[type]); + pids[type] = pid; } -void detach_pid(struct task_struct *task, enum pid_type type) +void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type) { - __change_pid(task, type, NULL); + __change_pid(pids, task, type, NULL); } -void change_pid(struct task_struct *task, enum pid_type type, +void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type, struct pid *pid) { - __change_pid(task, type, pid); + __change_pid(pids, task, type, pid); attach_pid(task, type); } @@ -386,6 +391,8 @@ void exchange_tids(struct task_struct *left, struct 
task_struct *right) struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + lockdep_assert_held_write(&tasklist_lock); + /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); @@ -403,6 +410,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { WARN_ON_ONCE(type == PIDTYPE_PID); + lockdep_assert_held_write(&tasklist_lock); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } @@ -564,15 +572,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) */ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) { - unsigned int f_flags; + unsigned int f_flags = 0; struct pid *pid; struct task_struct *task; + enum pid_type type; - pid = pidfd_get_pid(pidfd, &f_flags); - if (IS_ERR(pid)) - return ERR_CAST(pid); + switch (pidfd) { + case PIDFD_SELF_THREAD: + type = PIDTYPE_PID; + pid = get_task_pid(current, type); + break; + case PIDFD_SELF_THREAD_GROUP: + type = PIDTYPE_TGID; + pid = get_task_pid(current, type); + break; + default: + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + type = PIDTYPE_TGID; + break; + } - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, type); put_pid(pid); if (!task) return ERR_PTR(-ESRCH); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 8f6cfec87555..7098ed44e717 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -107,7 +107,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns goto out_free_idr; ns->ns.ops = &pidns_operations; - ns->pid_max = parent_pid_ns->pid_max; + ns->pid_max = PID_MAX_LIMIT; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca947ed32e3d..54a623680019 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -380,8 +380,7 @@ config CPU_PM config ENERGY_MODEL bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" - depends on SMP - depends on CPU_FREQ + depends on CPU_FREQ || PM_DEVFREQ help Several subsystems (thermal and/or the task scheduler for example) can leverage information about the energy consumed by devices to diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 3874f0e97651..d9b7e2b38c7a 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -161,22 +161,10 @@ static void em_debug_create_pd(struct device *dev) {} static void em_debug_remove_pd(struct device *dev) {} #endif -static void em_destroy_table_rcu(struct rcu_head *rp) -{ - struct em_perf_table __rcu *table; - - table = container_of(rp, struct em_perf_table, rcu); - kfree(table); -} - static void em_release_table_kref(struct kref *kref) { - struct em_perf_table __rcu *table; - /* It was the last owner of this table so we can free */ - table = container_of(kref, struct em_perf_table, kref); - - call_rcu(&table->rcu, em_destroy_table_rcu); + kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu); } /** @@ -185,7 +173,7 @@ static void em_release_table_kref(struct kref *kref) * * No return values. */ -void em_table_free(struct em_perf_table __rcu *table) +void em_table_free(struct em_perf_table *table) { kref_put(&table->kref, em_release_table_kref); } @@ -198,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table) * has a user. * Returns allocated table or NULL. 
*/ -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) { - struct em_perf_table __rcu *table; + struct em_perf_table *table; int table_size; table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; @@ -239,7 +227,7 @@ static void em_init_performance(struct device *dev, struct em_perf_domain *pd, } static int em_compute_costs(struct device *dev, struct em_perf_state *table, - struct em_data_callback *cb, int nr_states, + const struct em_data_callback *cb, int nr_states, unsigned long flags) { unsigned long prev_cost = ULONG_MAX; @@ -308,9 +296,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, * Return 0 on success or an error code on failure. */ int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table) + struct em_perf_table *new_table) { - struct em_perf_table __rcu *old_table; + struct em_perf_table *old_table; struct em_perf_domain *pd; if (!dev) @@ -327,7 +315,8 @@ int em_dev_update_perf_domain(struct device *dev, kref_get(&new_table->kref); - old_table = pd->em_table; + old_table = rcu_dereference_protected(pd->em_table, + lockdep_is_held(&em_pd_mutex)); rcu_assign_pointer(pd->em_table, new_table); em_cpufreq_update_efficiencies(dev, new_table->state); @@ -341,7 +330,7 @@ EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, struct em_perf_state *table, - struct em_data_callback *cb, + const struct em_data_callback *cb, unsigned long flags) { unsigned long power, freq, prev_freq = 0; @@ -396,10 +385,11 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, + const struct em_data_callback *cb, + const cpumask_t *cpus, unsigned long flags) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; int cpu, ret, num_cpus; @@ -556,9 +546,10 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, - bool microwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { + struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; int cpu, ret; @@ -631,7 +622,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, dev->em_pd->min_perf_state = 0; dev->em_pd->max_perf_state = nr_states - 1; - em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); + em_table = rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex)); + em_cpufreq_update_efficiencies(dev, em_table->state); em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); @@ -668,7 +661,8 @@ void em_dev_unregister_perf_domain(struct device *dev) mutex_lock(&em_pd_mutex); em_debug_remove_pd(dev); - em_table_free(dev->em_pd->em_table); + em_table_free(rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex))); kfree(dev->em_pd); dev->em_pd = NULL; @@ -676,9 +670,9 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) +static struct em_perf_table *em_table_dup(struct em_perf_domain *pd) { - struct em_perf_table __rcu 
*em_table; + struct em_perf_table *em_table; struct em_perf_state *ps, *new_ps; int ps_size; @@ -700,7 +694,7 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) } static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, - struct em_perf_table __rcu *em_table) + struct em_perf_table *em_table) { int ret; @@ -728,10 +722,9 @@ free_em_table: * are correctly calculated. */ static void em_adjust_new_capacity(struct device *dev, - struct em_perf_domain *pd, - u64 max_cap) + struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; em_table = em_table_dup(pd); if (!em_table) { @@ -775,7 +768,8 @@ static void em_check_capacity_update(void) } cpufreq_cpu_put(policy); - pd = em_cpu_get(cpu); + dev = get_cpu_device(cpu); + pd = em_pd_get(dev); if (!pd || em_is_artificial(pd)) continue; @@ -799,8 +793,7 @@ static void em_check_capacity_update(void) pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, cpu_capacity, em_max_perf); - dev = get_cpu_device(cpu); - em_adjust_new_capacity(dev, pd, cpu_capacity); + em_adjust_new_capacity(dev, pd); } free_cpumask_var(cpu_done_mask); @@ -822,7 +815,7 @@ static void em_update_workfn(struct work_struct *work) */ int em_dev_update_chip_binning(struct device *dev) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; int i, ret; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 10a01af63a80..23c0f4e6cb2f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "PM: hibernation: " fmt +#include <crypto/acompress.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/suspend.h> @@ -411,7 +412,7 @@ int hibernation_snapshot(int platform_mode) goto Thaw; } - suspend_console(); + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -437,7 +438,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); - resume_console(); + console_resume_all(); dpm_complete(msg); Close: @@ -547,7 +548,7 @@ int hibernation_restore(int platform_mode) int error; pm_prepare_console(); - suspend_console(); + console_suspend_all(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -561,7 +562,7 @@ int hibernation_restore(int platform_mode) } dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); - resume_console(); + console_resume_all(); pm_restore_console(); return error; } @@ -586,7 +587,7 @@ int hibernation_platform_enter(void) goto Close; entering_platform_hibernation = true; - suspend_console(); + console_suspend_all(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -639,7 +640,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - resume_console(); + console_resume_all(); Close: hibernation_ops->end(); @@ -757,7 +758,7 @@ int hibernate(void) */ if (!nocompress) { strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo)); - if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); return -EOPNOTSUPP; } @@ -901,7 +902,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto dpm_complete; - suspend_console(); + console_suspend_all(); error = dpm_suspend(PMSG_FREEZE); if (error) @@ -925,7 +926,7 
@@ skip: dpm_resume: dpm_resume(PMSG_THAW); - resume_console(); + console_resume_all(); dpm_complete: dpm_complete(PMSG_THAW); @@ -1008,7 +1009,7 @@ static int software_resume(void) strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo)); else strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo)); - if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) { + if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) { pr_err("%s compression is not available\n", hib_comp_algo); error = -EOPNOTSUPP; goto Unlock; @@ -1446,10 +1447,10 @@ static const char * const comp_alg_enabled[] = { static int hibernate_compressor_param_set(const char *compressor, const struct kernel_param *kp) { - unsigned int sleep_flags; int index, ret; - sleep_flags = lock_system_sleep(); + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; index = sysfs_match_string(comp_alg_enabled, compressor); if (index >= 0) { @@ -1461,7 +1462,7 @@ static int hibernate_compressor_param_set(const char *compressor, ret = index; } - unlock_system_sleep(sleep_flags); + mutex_unlock(&system_transition_mutex); if (ret) pr_debug("Cannot set specified compressor %s\n", diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c9fb559a6399..4e6e24e8b854 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2270,9 +2270,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2561,9 +2561,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page); + dst = kmap_local_page(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); last_highmem_page = NULL; } } @@ -2881,13 +2881,13 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2, { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1); - kaddr2 = kmap_atomic(p2); + kaddr1 = kmap_local_page(p1); + kaddr2 = kmap_local_page(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2); - kunmap_atomic(kaddr1); + kunmap_local(kaddr2); + kunmap_local(kaddr1); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 09f8397bae15..8eaec4ab121d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -91,6 +91,16 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); + /* + * The correctness of the code below depends on the number of online + * CPUs being stable, but CPUs cannot be taken offline or put online + * while it is running. + * + * The s2idle_lock must be acquired before the pending wakeup check to + * prevent pm_system_wakeup() from running as a whole between that check + * and the subsequent s2idle_state update in which case a wakeup event + * would get lost. + */ raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; @@ -98,8 +108,6 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - cpus_read_lock(); - /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. 
*/ @@ -112,8 +120,6 @@ static void s2idle_enter(void) */ wake_up_all_idle_cpus(); - cpus_read_unlock(); - raw_spin_lock_irq(&s2idle_lock); out: @@ -502,7 +508,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (error) goto Close; - suspend_console(); + console_suspend_all(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -521,9 +527,9 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - trace_suspend_resume(TPS("resume_console"), state, true); - resume_console(); - trace_suspend_resume(TPS("resume_console"), state, false); + trace_suspend_resume(TPS("console_resume_all"), state, true); + console_resume_all(); + trace_suspend_resume(TPS("console_resume_all"), state, false); Close: platform_resume_end(state); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 82b884b67152..80ff5f933a62 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "PM: " fmt +#include <crypto/acompress.h> #include <linux/module.h> #include <linux/file.h> #include <linux/delay.h> @@ -635,7 +636,8 @@ static int crc32_threadfn(void *data) */ struct cmp_data { struct task_struct *thr; /* thread */ - struct crypto_comp *cc; /* crypto compressor stream */ + struct crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -656,7 +658,6 @@ static atomic_t compressed_size = ATOMIC_INIT(0); static int compress_threadfn(void *data) { struct cmp_data *d = data; - unsigned int cmp_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -670,11 +671,13 @@ static int compress_threadfn(void *data) } atomic_set(&d->ready, 0); - cmp_len = CMP_SIZE - CMP_HEADER; - d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len, - d->cmp + CMP_HEADER, - &cmp_len); - d->cmp_len = cmp_len; + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->unc, d->unc_len); + acomp_request_set_dst_nondma(d->cr, d->cmp + CMP_HEADER, + CMP_SIZE - CMP_HEADER); + d->ret = crypto_acomp_compress(d->cr); + d->cmp_len = d->cr->dlen; atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); atomic_set_release(&d->stop, 1); @@ -745,13 +748,20 @@ static int save_compressed_image(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + data[thr].thr = kthread_run(compress_threadfn, &data[thr], "image_compress/%u", thr); @@ -899,8 +909,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cc) - crypto_free_comp(data[thr].cc); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); } vfree(data); } @@ -1142,7 +1152,8 @@ static int load_image(struct swap_map_handle *handle, */ struct dec_data { struct task_struct *thr; /* thread */ - struct crypto_comp *cc; /* crypto compressor stream */ + struct 
crypto_acomp *cc; /* crypto compressor */ + struct acomp_req *cr; /* crypto request */ atomic_t ready; /* ready to start flag */ atomic_t stop; /* ready to stop flag */ int ret; /* return code */ @@ -1160,7 +1171,6 @@ struct dec_data { static int decompress_threadfn(void *data) { struct dec_data *d = data; - unsigned int unc_len = 0; while (1) { wait_event(d->go, atomic_read_acquire(&d->ready) || @@ -1174,10 +1184,13 @@ static int decompress_threadfn(void *data) } atomic_set(&d->ready, 0); - unc_len = UNC_SIZE; - d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len, - d->unc, &unc_len); - d->unc_len = unc_len; + acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + acomp_request_set_src_nondma(d->cr, d->cmp + CMP_HEADER, + d->cmp_len); + acomp_request_set_dst_nondma(d->cr, d->unc, UNC_SIZE); + d->ret = crypto_acomp_decompress(d->cr); + d->unc_len = d->cr->dlen; if (clean_pages_on_decompress) flush_icache_range((unsigned long)d->unc, @@ -1254,13 +1267,20 @@ static int load_compressed_image(struct swap_map_handle *handle, init_waitqueue_head(&data[thr].go); init_waitqueue_head(&data[thr].done); - data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0); + data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR_OR_NULL(data[thr].cc)) { pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc)); ret = -EFAULT; goto out_clean; } + data[thr].cr = acomp_request_alloc(data[thr].cc); + if (!data[thr].cr) { + pr_err("Could not allocate comp request\n"); + ret = -ENOMEM; + goto out_clean; + } + data[thr].thr = kthread_run(decompress_threadfn, &data[thr], "image_decompress/%u", thr); @@ -1507,8 +1527,8 @@ out_clean: for (thr = 0; thr < nr_threads; thr++) { if (data[thr].thr) kthread_stop(data[thr].thr); - if (data[thr].cc) - crypto_free_comp(data[thr].cc); + acomp_request_free(data[thr].cr); + crypto_free_acomp(data[thr].cc); } vfree(data); } diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index a91bdf802967..48a24e7b309d 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -64,6 +64,7 @@ struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; +extern bool debug_non_panic_cpus; __printf(4, 0) int vprintk_store(int facility, int level, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 07668433644b..1eea80d0648e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2375,6 +2375,22 @@ void printk_legacy_allow_panic_sync(void) } } +bool __read_mostly debug_non_panic_cpus; + +#ifdef CONFIG_PRINTK_CALLER +static int __init debug_non_panic_cpus_setup(char *str) +{ + debug_non_panic_cpus = true; + pr_info("allow messages from non-panic CPUs in panic()\n"); + + return 0; +} +early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup); +module_param(debug_non_panic_cpus, bool, 0644); +MODULE_PARM_DESC(debug_non_panic_cpus, + "allow messages from non-panic CPUs in panic()"); +#endif + asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) @@ -2391,7 +2407,9 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. 
*/ - if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) + if (other_cpu_in_panic() && + !debug_non_panic_cpus && + !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); @@ -2461,7 +2479,6 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); -static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ @@ -2474,7 +2491,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre static u64 syslog_seq; -static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -2733,11 +2749,11 @@ module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** - * suspend_console - suspend the console subsystem + * console_suspend_all - suspend the console subsystem * * This disables printk() while we go into suspend states */ -void suspend_console(void) +void console_suspend_all(void) { struct console *con; @@ -2760,7 +2776,7 @@ void suspend_console(void) synchronize_srcu(&console_srcu); } -void resume_console(void) +void console_resume_all(void) { struct console_flush_type ft; struct console *con; @@ -3342,7 +3358,12 @@ void console_unblank(void) */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) { + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) { found_unblank = true; break; } @@ -3379,7 +3400,12 @@ void console_unblank(void) cookie = console_srcu_read_lock(); for_each_console_srcu(c) { - if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) + short flags = console_srcu_read_flags(c); + + if (flags & CON_SUSPENDED) + continue; + + if ((flags & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); @@ -3497,10 +3523,10 @@ struct tty_driver *console_device(int *index) /* * Prevent further output on the passed console device so that (for example) - * serial drivers can disable console output before suspending a port, and can + * serial drivers can suspend console output before suspending a port, and can * re-enable output afterwards. */ -void console_stop(struct console *console) +void console_suspend(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); @@ -3515,9 +3541,9 @@ void console_stop(struct console *console) */ synchronize_srcu(&console_srcu); } -EXPORT_SYMBOL(console_stop); +EXPORT_SYMBOL(console_suspend); -void console_start(struct console *console) +void console_resume(struct console *console) { struct console_flush_type ft; bool is_nbcon; @@ -3542,7 +3568,7 @@ void console_start(struct console *console) __pr_flush(console, 1000, true); } -EXPORT_SYMBOL(console_start); +EXPORT_SYMBOL(console_resume); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); @@ -4277,6 +4303,11 @@ void __init console_init(void) initcall_t call; initcall_entry_t *ce; +#ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE + if (!console_set_on_cmdline) + add_preferred_console("ttynull", 0, NULL); +#endif + /* Setup the default TTY line discipline. 
*/ n_tty_init(); @@ -4466,7 +4497,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ -static bool pr_flush(int timeout_ms, bool reset_on_progress) +bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 88e8f3a61922..d9fb053cff67 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2133,9 +2133,9 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * there may be other finalized records beyond that * need to be printed for a panic situation. If this * is the panic CPU, skip this - * non-existent/non-finalized record unless it is - * at or beyond the head, in which case it is not - * possible to continue. + * non-existent/non-finalized record unless non-panic + * CPUs are still running and their debugging is + * explicitly enabled. * * Note that new messages printed on panic CPU are * finalized when we are here. The only exception @@ -2143,10 +2143,13 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb))) + if (this_cpu_in_panic() && + (!debug_non_panic_cpus || legacy_allow_panic_sync) && + ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; - else + } else { return false; + } } } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185d..aa42de4d2768 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -18,7 +18,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPTION + default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC) select TREE_RCU help This option selects the RCU implementation that is @@ -65,6 +65,17 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config FORCE_NEED_SRCU_NMI_SAFE + bool "Force selection of NEED_SRCU_NMI_SAFE" + depends on !TINY_SRCU + select NEED_SRCU_NMI_SAFE + default n + help + This option forces selection of the NEED_SRCU_NMI_SAFE + Kconfig option, allowing testing of srcu_read_lock_nmisafe() + and srcu_read_unlock_nmisafe() on architectures (like x86) + that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option. + config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU @@ -91,7 +102,7 @@ config NEED_TASKS_RCU config TASKS_RCU bool - default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) + default NEED_TASKS_RCU && PREEMPTION select IRQ_WORK config FORCE_TASKS_RUDE_RCU @@ -323,21 +334,27 @@ config RCU_LAZY depends on RCU_NOCB_CPU default n help - To save power, batch RCU callbacks and flush after delay, memory - pressure, or callback list growing too big. + To save power, batch RCU callbacks and delay starting the + corresponding grace period for multiple seconds. The grace + period will be started after this delay, in case of memory + pressure, or if the corresponding CPU's callback list grows + too large. - Requires rcu_nocbs=all to be set. + These delays happen only on rcu_nocbs CPUs, that is, CPUs + whose callbacks have been offloaded. - Use rcutree.enable_rcu_lazy=0 to turn it off at boot time. + Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to + globally disable these delays. 
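The reworked RCU_LAZY help text above only matters for callbacks queued on offloaded (rcu_nocbs) CPUs, where the grace period behind a call_rcu()/kfree_rcu() request may be deferred for several seconds. Below is a minimal sketch of a caller that tolerates that deferral; the struct and helper names are illustrative and not part of this patch, only kfree_rcu() itself is the real kernel API.

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	/* Illustrative type, not from this patch. */
	struct lazy_item {
		int payload;
		struct rcu_head rcu;
	};

	static void lazy_item_release(struct lazy_item *item)
	{
		/*
		 * kfree_rcu() only promises that the object outlives current
		 * RCU readers.  With RCU_LAZY=y on an rcu_nocbs CPU the grace
		 * period (and therefore the kfree()) may start seconds later,
		 * which is fine for ordinary cleanup but not for anything that
		 * must reclaim memory promptly.
		 */
		kfree_rcu(item, rcu);
	}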
config RCU_LAZY_DEFAULT_OFF bool "Turn RCU lazy invocation off by default" depends on RCU_LAZY default n help - Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default - off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch - it back on. + Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel + to boot with these energy-efficiency delays disabled. Use the + rcutree.enable_rcu_lazy=1 kernel-boot parameter to override + this option at boot time, thus re-enabling these delays. config RCU_DOUBLE_CHECK_CB_TIME bool "RCU callback-batch backup time check" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 6af90510a1ca..12e4c64ebae1 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -54,7 +54,7 @@ config RCU_TORTURE_TEST Say N if you are unsure. config RCU_TORTURE_TEST_CHK_RDR_STATE - tristate "Check rcutorture reader state" + bool "Check rcutorture reader state" depends on RCU_TORTURE_TEST default n help @@ -70,7 +70,7 @@ config RCU_TORTURE_TEST_CHK_RDR_STATE Say N if you are unsure. config RCU_TORTURE_TEST_LOG_CPU - tristate "Log CPU for rcutorture failures" + bool "Log CPU for rcutorture failures" depends on RCU_TORTURE_TEST default n help @@ -84,6 +84,20 @@ config RCU_TORTURE_TEST_LOG_CPU Say Y here if you want CPU IDs logged. Say N if you are unsure. +config RCU_TORTURE_TEST_LOG_GP + bool "Log grace-period numbers for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + corresponding grace-period sequence numbers. This information + can be useful, but it does incur additional overhead, overhead + that can make both failures and close calls less probable. + + Say Y here if you want grace-period sequence numbers logged. + Say N if you are unsure. 
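The new RCU_TORTURE_TEST_LOG_GP symbol is a plain bool consumed at compile time: the rcutorture hunks further down gate the extra grace-period sampling with IS_ENABLED(), so the overhead mentioned in the help text disappears entirely when the option is =n. A rough sketch of that pattern follows; the helper name and its callback argument are hypothetical stand-ins for the rcutorture internals.

	#include <linux/kconfig.h>
	#include <linux/printk.h>

	/* gather() stands in for cur_ops->gather_gp_seqs(); both names are illustrative. */
	static void maybe_log_gp_seq(unsigned long long (*gather)(void))
	{
		/*
		 * IS_ENABLED() folds to a compile-time 0 or 1, so with
		 * CONFIG_RCU_TORTURE_TEST_LOG_GP=n the branch below is
		 * discarded by the compiler and no sampling cost is paid.
		 */
		if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && gather)
			pr_info("gp_seq snapshot: %#llx\n", gather());
	}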
+ config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..eed2951a4962 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1)); } /* @@ -590,6 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif static inline void rcu_gp_set_torture_wait(int duration) { } #endif +unsigned long long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); #ifdef CONFIG_TINY_SRCU @@ -611,8 +613,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long -srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } @@ -624,7 +624,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long srcu_batches_completed(struct srcu_struct *sp); bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); @@ -636,6 +635,12 @@ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TINY_SRCU +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +#else // #ifdef CONFIG_TINY_SRCU +unsigned long srcu_batches_completed(struct srcu_struct *sp); +#endif // #else // #ifdef CONFIG_TINY_SRCU + #ifdef CONFIG_RCU_NOCB_CPU void rcu_bind_current_to_nocb(void); #else diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d26fb1d33ed9..65095664f5c5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -135,6 +135,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); @@ -147,6 +148,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealnocbers; static int nrealreaders; +static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; @@ -272,6 +274,9 @@ struct rt_read_seg { bool 
rt_preempted; int rt_cpu; int rt_end_cpu; + unsigned long long rt_gp_seq; + unsigned long long rt_gp_seq_end; + u64 rt_ts; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; @@ -406,6 +411,8 @@ struct rcu_torture_ops { void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); bool (*reader_blocked)(void); + unsigned long long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); long cbflood_max; int irq_capable; int can_boost; @@ -610,6 +617,8 @@ static struct rcu_torture_ops rcu_ops = { .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) ? has_rcu_reader_blocked : NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -655,6 +664,8 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" @@ -677,8 +688,11 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { int idx; + struct srcu_ctr __percpu *scp; int ret = 0; + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { idx = srcu_read_lock(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); @@ -694,6 +708,12 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 3; + } return ret; } @@ -719,6 +739,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); if (reader_flavor & SRCU_READ_FLAVOR_LITE) srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & SRCU_READ_FLAVOR_NMI) @@ -791,6 +813,7 @@ static struct rcu_torture_ops srcu_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -834,6 +857,7 @@ static struct rcu_torture_ops srcud_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -1148,8 +1172,19 @@ static int rcu_torture_boost(void *arg) unsigned long gp_state; unsigned long gp_state_time; unsigned long oldstarttime; + unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ; - VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) { + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + } else { + 
VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period"); + while (time_before(jiffies, booststarttime)) { + schedule_timeout_idle(HZ); + if (kthread_should_stop()) + goto cleanup; + } + VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period"); + } /* Set real-time priority. */ sched_set_fifo_low(current); @@ -1225,6 +1260,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); +cleanup: /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); @@ -1728,7 +1764,7 @@ rcu_torture_fakewriter(void *arg) do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && - torture_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nrealfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else { switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -1873,6 +1909,8 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, #define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) { + int mask; + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) return; @@ -1899,11 +1937,27 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, WARN_ONCE(cur_ops->extendables && !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); - WARN_ONCE(cur_ops->extendables && - !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + + /* + * non-preemptible RCU in a preemptible kernel uses preempt_disable() + * as rcu_read_lock(). + */ + mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + + WARN_ONCE(cur_ops->extendables && !(curstate & mask) && (preempt_count() & PREEMPT_MASK), ROEC_ARGS); - WARN_ONCE(cur_ops->readlock_nesting && - !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + + /* + * non-preemptible RCU in a preemptible kernel uses "preempt_count() & + * PREEMPT_MASK" as ->readlock_nesting(). + */ + mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && cur_ops->readlock_nesting() > 0, ROEC_ARGS); } @@ -1965,6 +2019,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); } } + // Sample grace-period sequence number, as good a place as any. 
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { + rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + rtrsp->rt_ts = ktime_get_mono_fast_ns(); + if (!first) + rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; + } /* * Next, remove old protection, in decreasing order of strength @@ -2512,7 +2573,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " + "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " @@ -2522,11 +2583,11 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " "preempt_duration=%d preempt_interval=%d\n", - torture_type, tag, nrealreaders, nfakewriters, + torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, + test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, @@ -3553,6 +3614,7 @@ rcu_torture_cleanup(void) int flags = 0; unsigned long gp_seq = 0; int i; + int j; if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) { @@ -3597,7 +3659,7 @@ rcu_torture_cleanup(void) rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) + for (i = 0; i < nrealfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); kfree(fakewriter_tasks); @@ -3635,7 +3697,11 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP)) + pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL)); + else + pr_alert("\t"); + pr_cont("%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); @@ -3648,6 +3714,27 @@ rcu_torture_cleanup(void) else pr_cont(" ..."); } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && + cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { + char buf1[20+1]; + char buf2[20+1]; + char sepchar = '-'; + + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, + buf1, ARRAY_SIZE(buf1)); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, + buf2, ARRAY_SIZE(buf2)); + if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { + if (buf2[0]) { + for (j = 0; buf2[j]; j++) + buf2[j] = '.'; + if (j) + buf2[j - 1] = ' '; + } + sepchar = ' '; + } + pr_cont(" %s%c%s", buf1, sepchar, buf2); + } if (err_segs[i].rt_delay_ms != 0) { pr_cont(" %s%ldms", firsttime ? 
"" : "+", err_segs[i].rt_delay_ms); @@ -3994,6 +4081,14 @@ rcu_torture_init(void) rcu_torture_init_srcu_lockdep(); + if (nfakewriters >= 0) { + nrealfakewriters = nfakewriters; + } else { + nrealfakewriters = num_online_cpus() - 2 - nfakewriters; + if (nrealfakewriters <= 0) + nrealfakewriters = 1; + } + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -4050,8 +4145,9 @@ rcu_torture_init(void) writer_task); if (torture_init_error(firsterr)) goto unwind; - if (nfakewriters > 0) { - fakewriter_tasks = kcalloc(nfakewriters, + + if (nrealfakewriters > 0) { + fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -4060,7 +4156,7 @@ rcu_torture_init(void) goto unwind; } } - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); if (torture_init_error(firsterr)) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1b47376acdc4..f11a7c2af778 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -216,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_fast_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_fast_ref_scale_read_section, + .delaysection = srcu_fast_ref_scale_delay_section, + .name = "srcu-fast" +}; + static void srcu_lite_ref_scale_read_section(const int nloops) { int i; @@ -1163,7 +1193,7 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 4dcbf8aa80ff..6e9fe2ce1075 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -20,7 +20,11 @@ #include "rcu_segcblist.h" #include "rcu.h" +#ifndef CONFIG_TREE_RCU int rcu_scheduler_active __read_mostly; +#else // #ifndef CONFIG_TREE_RCU +extern int rcu_scheduler_active; +#endif // #else // #ifndef CONFIG_TREE_RCU static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; @@ -98,7 +102,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { int newval; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); @@ -120,7 +124,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { 
preempt_enable(); return; /* Already running or nothing to do. */ @@ -138,7 +142,7 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ preempt_enable(); swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); preempt_enable(); @@ -159,7 +163,7 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. */ - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_running, false); idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); preempt_enable(); @@ -172,7 +176,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { preempt_enable(); @@ -199,7 +203,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; @@ -261,7 +265,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) { unsigned long ret; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY ret = get_state_synchronize_srcu(ssp); srcu_gp_start_if_needed(ssp); preempt_enable(); @@ -282,11 +286,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); +#ifndef CONFIG_TREE_RCU /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } +#endif // #ifndef CONFIG_TREE_RCU /* * Queue work for srcu_struct structures with early boot callbacks. diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b83c74c4dcc0..d2a694944553 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -116,8 +116,9 @@ do { \ /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and - * srcu_read_unlock() running against them. So if the is_static parameter - * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + * srcu_read_unlock() running against them. So if the is_static + * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and + * ->srcu_ctrs[].srcu_unlocks. */ static void init_srcu_struct_data(struct srcu_struct *ssp) { @@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. 
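The srcutree.c hunks around this point replace the parallel ->srcu_lock_count[2]/->srcu_unlock_count[2] arrays with a single ->srcu_ctrs[2] array of lock/unlock pairs, selected through a ->srcu_ctrp pointer rather than the old ->srcu_idx integer. A simplified userspace model of that layout and of the reader fast path follows; struct srcu_ctr, srcu_locks and srcu_unlocks are names taken from the patch, while NR_CPUS, the flat sda[] array and the model_* helpers are invented here purely for illustration.

/* Simplified model of the two-rank, per-CPU SRCU reader counters. */
#include <stdatomic.h>

#define NR_CPUS 4

struct srcu_ctr {                       /* one rank of counters for one CPU */
        atomic_long srcu_locks;
        atomic_long srcu_unlocks;
};

struct srcu_data_model {                /* stands in for per-CPU srcu_data */
        struct srcu_ctr srcu_ctrs[2];
};

static struct srcu_data_model sda[NR_CPUS];

/* Reader fast path on one CPU: count a lock on the active rank... */
static int model_read_lock(int cpu, int active_idx)
{
        atomic_fetch_add(&sda[cpu].srcu_ctrs[active_idx].srcu_locks, 1);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() "B" */
        return active_idx;
}

/* ...and an unlock on whichever rank the matching lock returned. */
static void model_read_unlock(int cpu, int idx)
{
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() "C" */
        atomic_fetch_add(&sda[cpu].srcu_ctrs[idx].srcu_unlocks, 1);
}

In the kernel the active rank is not passed as an integer: __srcu_read_lock() reads the per-CPU ->srcu_ctrp pointer and converts it back to an index with __srcu_ptr_to_ctr(), as the later hunks show.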
*/ - BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); @@ -247,15 +246,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); - ssp->srcu_idx = 0; ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); ssp->srcu_sup->sda_is_static = is_static; - if (!is_static) + if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); @@ -429,10 +429,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp) } /* - * Computes approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx, and returns true if - * the caller did the proper barrier (gp), and if the count of the locks - * matches that of the unlocks passed in. + * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks + * values for the rank of per-CPU counters specified by idx, and returns + * true if the caller did the proper barrier (gp), and if the count of + * the locks matches that of the unlocks passed in. */ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { @@ -443,20 +443,20 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks); if (IS_ENABLED(CONFIG_PROVE_RCU)) mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed reader flavors for srcu_struct at %ps.\n", ssp); - if (mask & SRCU_READ_FLAVOR_LITE && !gp) + if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp) return false; return sum == unlocks; } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks + * values for the rank of per-CPU counters specified by idx. 
*/ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { @@ -467,7 +467,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, u for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks); mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), @@ -487,7 +487,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) unsigned long unlocks; unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); - did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); + did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP); /* * Make sure that a lock is always counted if the corresponding @@ -509,48 +509,49 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * If the locks are the same as the unlocks, then there must have * been no readers on this index at some point in this function. * But there might be more readers, as a task might have read - * the current ->srcu_idx but not yet have incremented its CPU's - * ->srcu_lock_count[idx] counter. In fact, it is possible + * the current ->srcu_ctrp but not yet have incremented its CPU's + * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible * that most of the tasks have been preempted between fetching - * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there - * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks - * in a system whose address space was fully populated with memory. - * Call this quantity Nt. + * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And + * there could be almost (ULONG_MAX / sizeof(struct task_struct)) + * tasks in a system whose address space was fully populated + * with memory. Call this quantity Nt. * - * So suppose that the updater is preempted at this point in the - * code for a long time. That now-preempted updater has already - * flipped ->srcu_idx (possibly during the preceding grace period), - * done an smp_mb() (again, possibly during the preceding grace - * period), and summed up the ->srcu_unlock_count[idx] counters. - * How many times can a given one of the aforementioned Nt tasks - * increment the old ->srcu_idx value's ->srcu_lock_count[idx] - * counter, in the absence of nesting? + * So suppose that the updater is preempted at this + * point in the code for a long time. That now-preempted + * updater has already flipped ->srcu_ctrp (possibly during + * the preceding grace period), done an smp_mb() (again, + * possibly during the preceding grace period), and summed up + * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times + * can a given one of the aforementioned Nt tasks increment the + * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter, + * in the absence of nesting? * * It can clearly do so once, given that it has already fetched - * the old value of ->srcu_idx and is just about to use that value - * to index its increment of ->srcu_lock_count[idx]. But as soon as - * it leaves that SRCU read-side critical section, it will increment - * ->srcu_unlock_count[idx], which must follow the updater's above - * read from that same value. Thus, as soon the reading task does - * an smp_mb() and a later fetch from ->srcu_idx, that task will be - * guaranteed to get the new index. 
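The long comment rewritten in the next hunk is the hardest part of this file to follow in diff form, so here is the check it documents, continuing the userspace model sketched after the init_srcu_struct_data() hunk above (same invented sda[] array). It deliberately ignores the reader-flavor and expedited-GP subtleties that srcu_readers_active_idx_check() handles; the point is only the ordering, namely that unlocks are summed first and locks only after a full barrier, so equality implies a moment with no readers on that rank.

static bool model_readers_gone(int idx)
{
        long unlocks = 0, locks = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                unlocks += atomic_load(&sda[cpu].srcu_ctrs[idx].srcu_unlocks);
        atomic_thread_fence(memory_order_seq_cst);  /* pairs with "B" and "C" */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                locks += atomic_load(&sda[cpu].srcu_ctrs[idx].srcu_locks);
        return locks == unlocks;
}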
Except that the increment of - * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the - * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() - * is before the smp_mb(). Thus, that task might not see the new - * value of ->srcu_idx until the -second- __srcu_read_lock(), - * which in turn means that this task might well increment - * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, - * not just once. + * the old value of ->srcu_ctrp and is just about to use that + * value to index its increment of ->srcu_ctrs[idx].srcu_locks. + * But as soon as it leaves that SRCU read-side critical section, + * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must + * follow the updater's above read from that same value. Thus, + as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_ctrp, that task will be guaranteed to get the new index. + * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks + * in __srcu_read_unlock() is after the smp_mb(), and the fetch + * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb(). + * Thus, that task might not see the new value of ->srcu_ctrp until + * the -second- __srcu_read_lock(), which in turn means that this + * task might well increment ->srcu_ctrs[idx].srcu_locks for the + * old value of ->srcu_ctrp twice, not just once. * * However, it is important to note that a given smp_mb() takes * effect not just for the task executing it, but also for any * later task running on that same CPU. * - * That is, there can be almost Nt + Nc further increments of - * ->srcu_lock_count[idx] for the old index, where Nc is the number - * of CPUs. But this is OK because the size of the task_struct - * structure limits the value of Nt and current systems limit Nc - * to a few thousand. + * That is, there can be almost Nt + Nc further increments + * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc + * is the number of CPUs. But this is OK because the size of + * the task_struct structure limits the value of Nt and current + * systems limit Nc to a few thousand. * * OK, but what about nesting? This does impose a limit on * nesting of half of the size of the task_struct structure @@ -581,10 +582,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[0]); - sum += atomic_long_read(&sdp->srcu_lock_count[1]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks); + sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks); + sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks); + sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks); } return sum; } @@ -647,6 +648,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { @@ -674,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - if (WARN_ON(!srcu_get_delay(ssp))) + spin_lock_irq_rcu_node(ssp->srcu_sup); + delay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (WARN_ON(!delay)) return; /* Just leak it! 
*/ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ @@ -743,12 +749,11 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); */ int __srcu_read_lock(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); + this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scp); } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -760,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -773,13 +778,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { - int idx; - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp); + struct srcu_ctr *scp = raw_cpu_ptr(scpp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_lock_count[idx]); + atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scpp); } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); @@ -790,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); */ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_unlock_count[idx]); + atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); @@ -1096,13 +1098,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->srcu_idx is not changed while checking. + * The caller must ensure that ->srcu_ctrp is not changed while checking. */ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1114,30 +1118,30 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) } /* - * Increment the ->srcu_idx counter so that future SRCU readers will + * Increment the ->srcu_ctrp counter so that future SRCU readers will * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *ssp) { /* - * Because the flip of ->srcu_idx is executed only if the + * Because the flip of ->srcu_ctrp is executed only if the * preceding call to srcu_readers_active_idx_check() found that - * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched - * and because that summing uses atomic_long_read(), there is - * ordering due to a control dependency between that summing and - * the WRITE_ONCE() in this call to srcu_flip(). 
This ordering - * ensures that if this updater saw a given reader's increment from - * __srcu_read_lock(), that reader was using a value of ->srcu_idx - * from before the previous call to srcu_flip(), which should be - * quite rare. This ordering thus helps forward progress because - * the grace period could otherwise be delayed by additional - * calls to __srcu_read_lock() using that old (soon to be new) - * value of ->srcu_idx. + * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums + * matched and because that summing uses atomic_long_read(), + * there is ordering due to a control dependency between that + * summing and the WRITE_ONCE() in this call to srcu_flip(). + * This ordering ensures that if this updater saw a given reader's + * increment from __srcu_read_lock(), that reader was using a value + * of ->srcu_ctrp from before the previous call to srcu_flip(), + * which should be quite rare. This ordering thus helps forward + * progress because the grace period could otherwise be delayed + * by additional calls to __srcu_read_lock() using that old (soon + * to be new) value of ->srcu_ctrp. * * This sum-equality check and ordering also ensures that if * a given call to __srcu_read_lock() uses the new value of - * ->srcu_idx, this updater's earlier scans cannot have seen + * ->srcu_ctrp, this updater's earlier scans cannot have seen * that reader's increments, which is all to the good, because * this grace period need not wait on that reader. After all, * if those earlier scans had seen that reader, there would have @@ -1152,7 +1156,8 @@ static void srcu_flip(struct srcu_struct *ssp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. + WRITE_ONCE(ssp->srcu_ctrp, + &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -1198,7 +1203,7 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) check_init_srcu_struct(ssp); /* If _lite() readers, don't do unsolicited expediting. */ - if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP) return false; /* If the local srcu_data structure has callbacks, not idle. */ sdp = raw_cpu_ptr(ssp->sda); @@ -1398,8 +1403,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, * read-side critical sections are delimited by srcu_read_lock() and * srcu_read_unlock(), and may be nested. * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. + * The callback will be invoked from process context, but with bh + * disabled. The callback function must therefore be fast and must + * not block. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) @@ -1465,8 +1474,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, - * and then flip the srcu_idx and wait for the count of the other index. + * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero + * at first, and then flip the ->srcu_ctrp and wait for the count of the + * other index. 
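In the srcu_flip() hunk above, flipping the active rank becomes a pointer update instead of an integer increment: &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])] selects the other element of the two-element array. A standalone illustration of that pointer-difference idiom (nothing here is kernel code):

#include <assert.h>

struct ctr { long locks, unlocks; };

int main(void)
{
        struct ctr ctrs[2];
        struct ctr *ctrp = &ctrs[0];

        /* ctrp - &ctrs[0] is 0 or 1; logical negation picks the other slot. */
        ctrp = &ctrs[!(ctrp - &ctrs[0])];
        assert(ctrp == &ctrs[1]);
        ctrp = &ctrs[!(ctrp - &ctrs[0])];
        assert(ctrp == &ctrs[0]);
        return 0;
}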
* * Can block; must be called from process context. * @@ -1675,7 +1685,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return READ_ONCE(ssp->srcu_idx); + return READ_ONCE(ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1692,7 +1702,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) /* * Because readers might be delayed for an extended period after - * fetching ->srcu_idx for their index, at any point in time there + * fetching ->srcu_ctrp for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -1720,7 +1730,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 1)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1738,7 +1748,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 2)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1849,7 +1859,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { @@ -1896,7 +1908,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; - idx = ssp->srcu_idx & 0x1; + idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", @@ -1914,8 +1926,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); - u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); + u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks)); + u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks)); /* * Make sure that a lock is always counted if the corresponding @@ -1923,8 +1935,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); - l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); + l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks)); + l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks)); c0 = l0 - u0; c1 = l1 - u1; @@ -2001,6 +2013,7 @@ static int srcu_module_coming(struct module *mod) ssp->sda = alloc_percpu(struct srcu_data); if (WARN_ON_ONCE(!ssp->sda)) return -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; } return 0; } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 59314da5eb60..466668eb4fad 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -2256,7 +2256,7 @@ void __init 
tasks_cblist_init_generic(void) #endif } -void __init rcu_init_tasks_generic(void) +static int __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU rcu_spawn_tasks_kthread(); @@ -2272,7 +2272,10 @@ void __init rcu_init_tasks_generic(void) // Run the self-tests. rcu_tasks_initiate_self_tests(); + + return 0; } +core_initcall(rcu_init_tasks_generic); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b3f31911465..c1ebfd51768b 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user) static inline bool rcu_reclaim_tiny(struct rcu_head *head) { rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kvfree_rcu_offset(offset)) { - trace_rcu_invoke_kvfree_callback("", head, offset); - kvfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } trace_rcu_invoke_callback("", head); f = head->func; @@ -159,10 +152,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static void tiny_rcu_leak_callback(struct rcu_head *rhp) -{ -} - /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); mem_dump_obj(head); } - - if (!__is_kvfree_rcu_offset((unsigned long)head->func)) - WRITE_ONCE(head->func, tiny_rcu_leak_callback); return; } @@ -246,15 +232,18 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) +unsigned long long rcutorture_gather_gp_seqs(void) { - if (head) - kasan_record_aux_stack(ptr); + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL; +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); - __kvfree_call_rcu(head, ptr); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + snprintf(cp, len, "g%04llx", seqs & 0xffffULL); } -EXPORT_SYMBOL_GPL(kvfree_call_rcu); +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif void __init rcu_init(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed14..659f83e71048 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -538,6 +538,26 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); +/* Gather grace-period sequence numbers for rcutorture diagnostics. */ +unsigned long long rcutorture_gather_gp_seqs(void) +{ + return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL); +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +/* Format grace-period sequence numbers for rcutorture diagnostics. 
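The Tree RCU rcutorture_gather_gp_seqs() above packs three sequence counters into one 64-bit value: the normal gp_seq in bits 40-55, the expedited sequence in bits 16-39, and the polled sequence in bits 0-15; the matching rcutorture_format_gp_seqs() appears just below. A standalone round trip of that packing, using made-up stand-in values rather than the real counters:

#include <stddef.h>
#include <stdio.h>

typedef unsigned long long u64;

static u64 gather(u64 gp, u64 exp, u64 polled)
{
        return ((gp & 0xffffULL) << 40) |
               ((exp & 0xffffffULL) << 16) |
               (polled & 0xffffULL);
}

static void format(u64 seqs, char *cp, size_t len)
{
        unsigned int egp = (seqs >> 16) & 0xffffffULL;
        unsigned int ggp = (seqs >> 40) & 0xffffULL;
        unsigned int pgp = seqs & 0xffffULL;

        snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp);
}

int main(void)
{
        char buf[24];

        format(gather(0x1234, 0xabcdef, 0x42), buf, sizeof(buf));
        printf("%s\n", buf);    /* prints g1234:eabcdef:p0042 */
        return 0;
}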
*/ +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + unsigned int egp = (seqs >> 16) & 0xffffffULL; + unsigned int ggp = (seqs >> 40) & 0xffffULL; + unsigned int pgp = seqs & 0xffffULL; + + snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); + #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on @@ -1254,7 +1274,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ rdp->core_needs_qs = false; @@ -1268,7 +1288,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1283,7 +1303,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); - if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1612,12 +1632,10 @@ static void rcu_sr_normal_complete(struct llist_node *node) { struct rcu_synchronize *rs = container_of( (struct rcu_head *) node, struct rcu_synchronize, head); - unsigned long oldstate = (unsigned long) rs->head.func; WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && - !poll_state_synchronize_rcu(oldstate), - "A full grace period is not passed yet: %lu", - rcu_seq_diff(get_state_synchronize_rcu(), oldstate)); + !poll_state_synchronize_rcu_full(&rs->oldstate), + "A full grace period is not passed yet!\n"); /* Finally. */ complete(&rs->completion); @@ -1801,10 +1819,14 @@ static noinline_for_stack bool rcu_gp_init(void) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(); + /* + * A new wait segment must be started before gp_seq advanced, so + * that previous gp waiters won't observe the new gp_seq. + */ + start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). 
*/ rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); @@ -2931,13 +2953,8 @@ static int __init rcu_spawn_core_kthreads(void) static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) { rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); } @@ -3107,7 +3124,7 @@ module_param(enable_rcu_lazy, bool, 0444); * critical sections have completed. * * Use this API instead of call_rcu() if you don't want the callback to be - * invoked after very long periods of time, which can happen on systems without + * delayed for very long periods of time, which can happen on systems without * memory pressure and on systems which are lightly loaded or mostly idle. * This function will cause callbacks to be invoked sooner than later at the * expense of extra power. Other than that, this function is identical to, and @@ -3138,6 +3155,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * might well execute concurrently with RCU read-side critical sections * that started after call_rcu() was invoked. * + * It is perfectly legal to repost an RCU callback, potentially with + * a different callback function, from within its callback function. + * The specified function will be invoked after another full grace period + * has elapsed. This use case is similar in form to the common practice + * of reposting a timer from within its own handler. + * * RCU read-side critical sections are delimited by rcu_read_lock() * and rcu_read_unlock(), and may be nested. In addition, but only in * v5.0 and later, regions of code across which interrupts, preemption, @@ -3166,6 +3189,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * * Implementation of these memory-ordering guarantees is described here: * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + * + * Specific to call_rcu() (as opposed to the other call_rcu*() functions), + * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many + * seconds before starting the grace period needed by the corresponding + * callback. This delay can significantly improve energy-efficiency + * on low-utilization battery-powered devices. To avoid this delay, + * in latency-sensitive kernel code, use call_rcu_hurry(). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3214,7 +3244,7 @@ static void synchronize_rcu_normal(void) * snapshot before adding a request. */ if (IS_ENABLED(CONFIG_PROVE_RCU)) - rs.head.func = (void *) get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&rs.oldstate); rcu_sr_normal_add_req(&rs); @@ -3357,14 +3387,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { - struct rcu_node *rnp = rcu_get_root(); - /* * Any prior manipulation of RCU-protected data must happen * before the loads from ->gp_seq and ->expedited_sequence. 
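The hunks above move synchronize_rcu_normal()'s debug cookie from a bare get_state_synchronize_rcu() value stashed in rs.head.func to a full struct rcu_gp_oldstate, and make get_state_synchronize_rcu_full() snapshot rcu_state.gp_seq rather than the root node's copy. For readers unfamiliar with the polled grace-period API, the usual calling pattern is sketched below; this is generic usage of the existing interface, not code from this patch, and it only compiles inside the kernel tree.

struct rcu_gp_oldstate gos;

get_state_synchronize_rcu_full(&gos);           /* snapshot before the update */
/* ... publish the update that readers must not miss ... */
if (!poll_state_synchronize_rcu_full(&gos))     /* has a full GP elapsed? */
        synchronize_rcu();                      /* no: wait for one */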
*/ smp_mb(); /* ^^^ */ - rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 77efed89c79e..8d4895c854c5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -230,17 +230,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) * specified leaf rcu_node structure, which is acquired by the caller. */ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, - unsigned long mask, bool wake) + unsigned long mask_in, bool wake) __releases(rnp->lock) { int cpu; + unsigned long mask; struct rcu_data *rdp; raw_lockdep_assert_held_rcu_node(rnp); - if (!(rnp->expmask & mask)) { + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + mask = mask_in & rnp->expmask; WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 2605dd234a13..5ff3bc56ff51 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1557,8 +1557,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { - char bufw[20]; - char bufr[20]; + char bufd[22]; + char bufw[45]; + char bufr[45]; + char bufn[22]; + char bufb[22]; struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; @@ -1572,9 +1575,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) typeof(*rdp), nocb_entry_rdp); - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]); + sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL], + rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]); + sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass)); + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], @@ -1586,12 +1593,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], + rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd, ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn, ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb, rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? 
task_state_to_char(rdp->nocb_cb_kthread) : '.', rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3600152b858e..3c0bbbbb686f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -833,8 +833,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); @@ -975,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs diff --git a/kernel/reboot.c b/kernel/reboot.c index b5a8569e5d81..41ab9e1ba357 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -704,6 +704,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); + pr_flush(1000, true); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } diff --git a/kernel/rseq.c b/kernel/rseq.c index 442aba29bc4c..b7a1ec327e81 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -78,24 +78,24 @@ efault: return -EFAULT; } -static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, - u32 node_id, u32 mm_cid) -{ - rseq_kernel_fields(t)->cpu_id_start = cpu_id; - rseq_kernel_fields(t)->cpu_id = cpu_id; - rseq_kernel_fields(t)->node_id = node_id; - rseq_kernel_fields(t)->mm_cid = mm_cid; -} +/* + * Update an rseq field and its in-kernel copy in lock-step to keep a coherent + * state. 
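The comment ending here introduces rseq_unsafe_put_user(), defined just below, which writes a value to the user-visible rseq area and mirrors it into the kernel-side copy used by CONFIG_DEBUG_RSEQ validation. The write-through pattern, reduced to a standalone model with invented names (fields, user_copy, shadow_copy, set_field):

struct fields { unsigned int cpu_id; unsigned int node_id; };

static struct fields user_copy;       /* stands in for the user-space rseq area */
static struct fields shadow_copy;     /* stands in for rseq_kernel_fields(t) */

/* Every store goes to both copies so later validation can compare them. */
#define set_field(field, value)                 \
        do {                                    \
                user_copy.field = (value);      \
                shadow_copy.field = (value);    \
        } while (0)

The macro form keeps the two stores adjacent at every call site, which is what the "lock-step" wording above refers to.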
+ */ +#define rseq_unsafe_put_user(t, value, field, error_label) \ + do { \ + unsafe_put_user(value, &t->rseq->field, error_label); \ + rseq_kernel_fields(t)->field = value; \ + } while (0) + #else static int rseq_validate_ro_fields(struct task_struct *t) { return 0; } -static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, - u32 node_id, u32 mm_cid) -{ -} +#define rseq_unsafe_put_user(t, value, field, error_label) \ + unsafe_put_user(value, &t->rseq->field, error_label) #endif /* @@ -173,17 +173,18 @@ static int rseq_update_cpu_node_id(struct task_struct *t) WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; - unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); - unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_put_user(node_id, &rseq->node_id, efault_end); - unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); + + rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); - rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid); trace_rseq_update(t); return 0; @@ -195,6 +196,7 @@ efault: static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { + struct rseq __user *rseq = t->rseq; u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; @@ -202,40 +204,61 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) * Validate read-only rseq fields. */ if (rseq_validate_ro_fields(t)) - return -EFAULT; - /* - * Reset cpu_id_start to its initial state (0). - */ - if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) - return -EFAULT; - /* - * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming - * in after unregistration can figure out that rseq needs to be - * registered again. - */ - if (put_user(cpu_id, &t->rseq->cpu_id)) - return -EFAULT; - /* - * Reset node_id to its initial state (0). - */ - if (put_user(node_id, &t->rseq->node_id)) - return -EFAULT; + goto efault; + + if (!user_write_access_begin(rseq, t->rseq_len)) + goto efault; + /* - * Reset mm_cid to its initial state (0). + * Reset all fields to their initial state. + * + * All fields have an initial state of 0 except cpu_id which is set to + * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after + * unregistration can figure out that rseq needs to be registered + * again. */ - if (put_user(mm_cid, &t->rseq->mm_cid)) - return -EFAULT; - - rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid); + rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ + user_write_access_end(); + return 0; + +efault_end: + user_write_access_end(); +efault: + return -EFAULT; +} + +/* + * Get the user-space pointer value stored in the 'rseq_cs' field. 
+ */ +static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +{ + if (!rseq_cs) + return -EFAULT; + +#ifdef CONFIG_64BIT + if (get_user(*rseq_cs, &rseq->rseq_cs)) + return -EFAULT; +#else + if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) + return -EFAULT; +#endif + return 0; } +/* + * If the rseq_cs field of 'struct rseq' contains a valid pointer to + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. + */ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; @@ -244,17 +267,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; -#ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) - return -EFAULT; -#endif + ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); + if (ret) + return ret; + + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } + /* Check that the pointer value fits in the user-space process space. */ if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; @@ -330,7 +352,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) return !!event_mask; } -static int clear_rseq_cs(struct task_struct *t) +static int clear_rseq_cs(struct rseq __user *rseq) { /* * The rseq_cs field is set to NULL on preemption or signal @@ -341,9 +363,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs); + return put_user(0UL, &rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) + if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) return -EFAULT; return 0; #endif @@ -375,11 +397,11 @@ static int rseq_ip_fixup(struct pt_regs *regs) * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t); + return clear_rseq_cs(t->rseq); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; - ret = clear_rseq_cs(t); + ret = clear_rseq_cs(t->rseq); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, @@ -453,6 +475,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; + u64 rseq_cs; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -507,9 +530,19 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; + + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. + */ + if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) + return -EFAULT; + if (rseq_cs && clear_rseq_cs(rseq)) + return -EFAULT; + #ifdef CONFIG_DEBUG_RSEQ /* * Initialize the in-kernel rseq fields copy for validation of @@ -522,6 +555,14 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EFAULT; #endif /* + * Activate the registration by setting the rseq area address, length + * and signature in the task struct. 
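The registration-time clearing of a stale rseq_cs pointer above matters because that pointer names a struct rseq_cs descriptor which the abort path trusts on the next return to user space. As background, the descriptor and the range test applied by in_rseq_cs() reduce to roughly the following simplified model; the real uapi structure also carries version and flags fields, and the kernel additionally checks the abort handler's signature.

#include <stdbool.h>
#include <stdint.h>

struct rseq_cs_model {
        uint64_t start_ip;              /* first instruction of the critical section */
        uint64_t post_commit_offset;    /* section length, in bytes */
        uint64_t abort_ip;              /* where execution restarts on abort */
};

/* Was the task interrupted inside the critical section? */
static bool ip_in_cs(uint64_t ip, const struct rseq_cs_model *cs)
{
        return ip - cs->start_ip < cs->post_commit_offset;
}

When the answer is yes and a preemption or signal was noted, the kernel redirects the task to abort_ip; when it is no, it simply clears the rseq_cs pointer, which is what clear_rseq_cs() above does.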
+ */ + current->rseq = rseq; + current->rseq_len = rseq_len; + current->rseq_sig = sig; + + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd45..8ae86371ddcd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -22,6 +22,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_build_policy.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_build_utility.o += -DDISABLE_BRANCH_PROFILING +endif # # Build efficiency: # diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 83d46b9b8ec8..2b331822c7e7 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, true); } static void @@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). */ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, true); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index fae1f5c921eb..72d97aa8b726 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -61,6 +61,7 @@ #ifdef CONFIG_SCHED_CLASS_EXT # include "ext.c" +# include "ext_idle.c" #endif #include "syscalls.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 80a3df49ab47..bf9d8db94b70 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -68,9 +68,7 @@ # include "cpufreq_schedutil.c" #endif -#ifdef CONFIG_SCHED_DEBUG -# include "debug.c" -#endif +#include "debug.c" #ifdef CONFIG_SCHEDSTATS # include "stats.c" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 165c90ba64ea..cfaca3040b2f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,6 @@ #include "autogroup.h" #include "pelt.h" #include "smp.h" -#include "stats.h" #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" @@ -119,7 +118,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -129,7 +127,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); */ #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | -const_debug unsigned int sysctl_sched_features = +__read_mostly unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT @@ -143,13 +141,12 @@ const_debug unsigned int sysctl_sched_features = */ __read_mostly int sysctl_resched_latency_warn_ms = 100; __read_mostly int sysctl_resched_latency_warn_once = 1; -#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. 
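The sysctl_sched_features initializer touched above is an x-macro: each SCHED_FEAT(name, enabled) line in features.h expands to (1UL << __SCHED_FEAT_##name) * enabled |, and the bare trailing 0 terminates the chain of ORs. A standalone reduction of the pattern, with invented feature names:

#include <stdio.h>

enum { __FEAT_A, __FEAT_B, __FEAT_C };

#define FEATURES        \
        FEAT(A, 1)      \
        FEAT(B, 0)      \
        FEAT(C, 1)

#define FEAT(name, enabled) (1UL << __FEAT_##name) * enabled |
static const unsigned long features =
        FEATURES
        0;
#undef FEAT

int main(void)
{
        printf("%#lx\n", features);     /* 0x5: A and C set, B clear */
        return 0;
}

Multiplying by the enabled flag rather than using #if keeps disabled features in the enum, so their bit positions stay stable while they contribute zero to the mask.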
*/ -const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; +__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; @@ -491,6 +488,16 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ +/* need a wrapper since we may need to trace from modules */ +EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); + +/* Call via the helper macro trace_set_current_state. */ +void __trace_set_current_state(int state_value) +{ + trace_sched_set_state_tp(current, state_value); +} +EXPORT_SYMBOL(__trace_set_current_state); + /* * Serialization rules: * @@ -800,11 +807,10 @@ void update_rq_clock(struct rq *rq) if (rq->clock_update_flags & RQCF_ACT_SKIP) return; -#ifdef CONFIG_SCHED_DEBUG if (sched_feat(WARN_DOUBLE_CLOCK)) - SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; -#endif + clock = sched_clock_cpu(cpu_of(rq)); scx_rq_clock_update(rq, clock); @@ -916,8 +922,7 @@ static void hrtick_rq_init(struct rq *rq) #ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - rq->hrtick_timer.function = hrtick; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -1063,9 +1068,10 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - /* Task can safely be re-inserted now: */ node = node->next; - task->wake_q.next = NULL; + /* pairs with cmpxchg_relaxed() in __wake_q_add() */ + WRITE_ONCE(task->wake_q.next, NULL); + /* Task can safely be re-inserted now. */ /* * wake_up_process() executes a full barrier, which pairs with @@ -1719,7 +1725,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; - SCHED_WARN_ON(!bucket->tasks); + WARN_ON_ONCE(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; @@ -1739,7 +1745,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * Defensive programming: this should never happen. If it happens, * e.g. due to future modification, warn and fix up the expected value. */ - SCHED_WARN_ON(bucket->value > rq_clamp); + WARN_ON_ONCE(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); uclamp_rq_set(rq, clamp_id, bkt_clamp); @@ -1756,7 +1762,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) @@ -1783,7 +1789,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. 
*/ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) @@ -1941,12 +1947,12 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write, } if (update_root_tg) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_update_root_tg(); } if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_sync_util_min_rt_default(); } @@ -2121,7 +2127,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - SCHED_WARN_ON(flags & DEQUEUE_SLEEP); + WARN_ON_ONCE(flags & DEQUEUE_SLEEP); WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ASSERT_EXCLUSIVE_WRITER(p->on_rq); @@ -2726,7 +2732,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) * XXX do further audits, this smells like something putrid. */ if (ctx->flags & SCA_MIGRATE_DISABLE) - SCHED_WARN_ON(!p->on_cpu); + WARN_ON_ONCE(!p->on_cpu); else lockdep_assert_held(&p->pi_lock); @@ -3291,7 +3297,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { -#ifdef CONFIG_SCHED_DEBUG unsigned int state = READ_ONCE(p->__state); /* @@ -3329,7 +3334,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(!cpu_online(new_cpu)); WARN_ON_ONCE(is_migration_disabled(p)); -#endif trace_sched_migrate_task(p, new_cpu); @@ -3921,13 +3925,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { - /* - * The BPF scheduler may depend on select_task_rq() being invoked during - * wakeups. In addition, @p may end up executing on a different CPU - * regardless of what happens in the wakeup path making the ttwu_queue - * optimization less meaningful. Skip if on SCX. - */ - if (task_on_scx(p)) + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ + if (!scx_allow_ttwu_queue(p)) return false; /* @@ -4195,7 +4194,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4489,7 +4488,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_LIST_HEAD(&p->se.group_node); /* A delayed task cannot be in clone(). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -5306,6 +5305,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ finish_task_switch(prev); + /* + * This is a special case: the newly created task has just + * switched the context for the first time. It is returning from + * schedule for the first time in this path. 
+ */ + trace_sched_exit_tp(true, CALLER_ADDR0); preempt_enable(); if (current->set_child_tid) @@ -5577,7 +5582,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_SCHED_DEBUG static u64 cpu_resched_latency(struct rq *rq) { int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); @@ -5622,9 +5626,6 @@ static int __init setup_resched_latency_warn_ms(char *str) return 1; } __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); -#else -static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } -#endif /* CONFIG_SCHED_DEBUG */ /* * This function gets called by the timer code, with HZ frequency. @@ -5745,7 +5746,7 @@ static void sched_tick_remote(struct work_struct *work) * we are always sure that there is no proxy (only a * single task is running). */ - SCHED_WARN_ON(rq->curr != rq->donor); + WARN_ON_ONCE(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -5965,7 +5966,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CT_STATE_USER); + WARN_ON_ONCE(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -6649,12 +6650,15 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; + bool is_switch = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; + trace_sched_entry_tp(preempt, CALLER_ADDR0); + cpu = smp_processor_id(); rq = cpu_rq(cpu); prev = rq->curr; @@ -6718,11 +6722,10 @@ static void __sched notrace __schedule(int sched_mode) picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); -#ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -#endif - if (likely(prev != next)) { + is_switch = prev != next; + if (likely(is_switch)) { rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -6767,6 +6770,7 @@ picked: __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); } + trace_sched_exit_tp(is_switch, CALLER_ADDR0); } void __noreturn do_task_dead(void) @@ -6811,7 +6815,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * deadlock if the callback attempts to acquire a lock which is * already acquired. */ - SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); + WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT); /* * If we are going to sleep and we have plugged IO queued, @@ -7094,7 +7098,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); + WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -7284,12 +7288,12 @@ out_unlock: #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) { - if (should_resched(0)) { + if (should_resched(0) && !irqs_disabled()) { preempt_schedule_common(); return 1; } /* - * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick * whether the current CPU is in an RCU read-side critical section, * so the tick can report quiescent states even for CPUs looping * in kernel context. 
In contrast, in non-preemptible kernels, @@ -7298,6 +7302,8 @@ int __sched __cond_resched(void) * RCU quiescent state. Therefore, the following code causes * cond_resched() to report a quiescent state, but only when RCU * is in urgent need of one. + * A third case, preemptible, but non-PREEMPT_RCU provides for + * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); @@ -7646,10 +7652,57 @@ PREEMPT_MODEL_ACCESSOR(lazy); #else /* !CONFIG_PREEMPT_DYNAMIC: */ +#define preempt_dynamic_mode -1 + static inline void preempt_dynamic_init(void) { } #endif /* CONFIG_PREEMPT_DYNAMIC */ +const char *preempt_modes[] = { + "none", "voluntary", "full", "lazy", NULL, +}; + +const char *preempt_model_str(void) +{ + bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) && + (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) || + IS_ENABLED(CONFIG_PREEMPT_LAZY)); + static char buf[128]; + + if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) { + struct seq_buf s; + + seq_buf_init(&s, buf, sizeof(buf)); + seq_buf_puts(&s, "PREEMPT"); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + seq_buf_printf(&s, "%sRT%s", + brace ? "_{" : "_", + brace ? "," : ""); + + if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { + seq_buf_printf(&s, "(%s)%s", + preempt_dynamic_mode > 0 ? + preempt_modes[preempt_dynamic_mode] : "undef", + brace ? "}" : ""); + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + seq_buf_printf(&s, "LAZY%s", + brace ? "}" : ""); + return seq_buf_str(&s); + } + + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD)) + return "VOLUNTARY"; + + return "NONE"; +} + int io_schedule_prepare(void) { int old_iowait = current->in_iowait; @@ -7764,10 +7817,9 @@ void show_state_filter(unsigned int state_filter) sched_show_task(p); } -#ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -#endif + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: @@ -8182,7 +8234,7 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); if (--num_cpus_frozen) return; /* @@ -8201,7 +8253,7 @@ static void cpuset_cpu_inactive(unsigned int cpu) cpuset_update_active_cpus(); } else { num_cpus_frozen++; - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); } } @@ -8423,9 +8475,9 @@ void __init sched_init_smp(void) * CPU masks are stable and all blatant races in the below code cannot * happen. 
*/ - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); sched_init_domains(cpu_active_mask); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) @@ -9015,7 +9067,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static struct task_group *sched_get_task_group(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -9027,13 +9079,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk) tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); - - return tg; -} - -static void sched_change_group(struct task_struct *tsk, struct task_group *group) -{ - tsk->sched_task_group = group; + tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) @@ -9050,24 +9096,15 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. */ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct task_group *group; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - /* - * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous - * group changes. - */ - group = sched_get_task_group(tsk); - if (group == tsk->sched_task_group) - return; - update_rq_clock(rq); running = task_current_donor(rq, tsk); @@ -9078,8 +9115,9 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, group); - scx_move_task(tsk); + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -9180,7 +9218,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, false); scx_cgroup_finish_attach(); } @@ -9201,7 +9239,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) unsigned int clamps; lockdep_assert_held(&uclamp_mutex); - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent @@ -9293,7 +9331,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); guard(mutex)(&uclamp_mutex); guard(rcu)(); @@ -10536,7 +10574,7 @@ static void task_mm_cid_work(struct callback_head *work) struct mm_struct *mm; int weight, cpu; - SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); work->next = work; /* Prevent double-add */ if (t->flags & PF_EXITING) diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 1ef98a93eb1d..c4606ca89210 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -65,7 +65,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * a cookie until after we've removed it, we must have core scheduling * enabled here. 
*/ - SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq)); if (sched_core_enqueued(p)) sched_core_dequeue(rq, p, DEQUEUE_SAVE); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5d9143dd0879..6dab4854c6c0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -9,8 +9,6 @@ #ifdef CONFIG_IRQ_TIME_ACCOUNTING -DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); - /* * There are no locks covering percpu hardirq/softirq time. * They are only modified in vtime_account, on corresponding CPU @@ -24,14 +22,16 @@ DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); +int sched_clock_irqtime; + void enable_sched_clock_irqtime(void) { - static_branch_enable(&sched_clock_irqtime); + sched_clock_irqtime = 1; } void disable_sched_clock_irqtime(void) { - static_branch_disable(&sched_clock_irqtime); + sched_clock_irqtime = 0; } static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 38e4537790af..ad45a8fea245 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -166,14 +166,14 @@ static inline unsigned long dl_bw_capacity(int i) } } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { struct root_domain *rd = cpu_rq(cpu)->rd; - if (rd->visit_gen == gen) + if (rd->visit_cookie == cookie) return true; - rd->visit_gen = gen; + rd->visit_cookie = cookie; return false; } @@ -207,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i) return SCHED_CAPACITY_SCALE; } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { return false; } @@ -249,8 +249,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; - SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); /* kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } @@ -262,7 +262,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; /* kick cpufreq (see the comment in kernel/sched/sched.h). 
*/ @@ -276,7 +276,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; - SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */ } static inline @@ -286,10 +286,10 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) dl_rq->this_bw = 0; - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); } static inline @@ -1382,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = dl_task_timer; + hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } /* @@ -1839,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = inactive_task_timer; + hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #define __node_2_dle(node) \ @@ -2956,7 +2954,7 @@ void dl_add_task_root_domain(struct task_struct *p) struct dl_bw *dl_b; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - if (!dl_task(p)) { + if (!dl_task(p) || dl_entity_is_special(&p->dl)) { raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return; } @@ -2981,18 +2979,22 @@ void dl_clear_root_domain(struct root_domain *rd) rd->dl_bw.total_bw = 0; /* - * dl_server bandwidth is only restored when CPUs are attached to root - * domains (after domains are created or CPUs moved back to the - * default root doamin). + * dl_servers are not tasks. Since dl_add_task_root_domain ignores + * them, we need to account for them here explicitly. */ for_each_cpu(i, rd->span) { struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; if (dl_server(dl_se) && cpu_active(i)) - rd->dl_bw.total_bw += dl_se->dl_bw; + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); } } +void dl_clear_root_domain_cpu(int cpu) +{ + dl_clear_root_domain(cpu_rq(cpu)->rd); +} + #endif /* CONFIG_SMP */ static void switched_from_dl(struct rq *rq, struct task_struct *p) @@ -3171,15 +3173,18 @@ DEFINE_SCHED_CLASS(dl) = { #endif }; -/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ -static u64 dl_generation; +/* + * Used for dl_bw check and update, used under sched_rt_handler()::mutex and + * sched_domains_mutex. + */ +u64 dl_cookie; int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu, cpus, ret = 0; unsigned long flags; @@ -3189,10 +3194,10 @@ int sched_dl_global_validate(void) * value smaller than the currently allocated bandwidth in * any of the root_domains. 
*/ - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) + if (dl_bw_visited(cpu, cookie)) goto next; dl_b = dl_bw_of(cpu); @@ -3229,7 +3234,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -3240,7 +3245,7 @@ void sched_dl_do_global(void) for_each_possible_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) { + if (dl_bw_visited(cpu, cookie)) { rcu_read_unlock_sched(); continue; } @@ -3567,9 +3572,7 @@ void dl_bw_free(int cpu, u64 dl_bw) } #endif -#ifdef CONFIG_SCHED_DEBUG void print_dl_stats(struct seq_file *m, int cpu) { print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fd7e85220715..56ae54e0ce6a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -244,11 +244,13 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { - static const char * preempt_modes[] = { - "none", "voluntary", "full", "lazy", - }; - int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + int j; + + /* Count entries in NULL terminated preempt_modes */ + for (j = 0; preempt_modes[j]; j++) + ; + j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); for (; i < j; i++) { if (preempt_dynamic_mode == i) @@ -292,7 +294,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, bool orig; cpus_read_lock(); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); orig = sched_debug_verbose; result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); @@ -304,7 +306,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, sd_dentry = NULL; } - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); cpus_read_unlock(); return result; @@ -515,9 +517,9 @@ static __init int sched_init_debug(void) debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); update_sched_domain_debugfs(); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); #endif #ifdef CONFIG_NUMA_BALANCING @@ -1262,6 +1264,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); + } else if (fair_policy(p->policy)) { + P(se.slice); } #ifdef CONFIG_SCHED_CLASS_EXT __PS("ext.enabled", task_on_scx(p)); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8857c0709bdd..21575d39c376 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6,6 +6,9 @@ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> * Copyright (c) 2022 David Vernet <dvernet@meta.com> */ +#include <linux/btf_ids.h> +#include "ext_idle.h" + #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { @@ -93,7 +96,7 @@ enum scx_ops_flags { /* * Keep built-in idle tracking even if ops.update_idle() is implemented. 
*/ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, /* * By default, if there are no other task to run on the CPU, ext core @@ -101,7 +104,7 @@ enum scx_ops_flags { * flag is specified, such tasks are passed to ops.enqueue() with * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. */ - SCX_OPS_ENQ_LAST = 1LLU << 1, + SCX_OPS_ENQ_LAST = 1LLU << 1, /* * An exiting task may schedule after PF_EXITING is set. In such cases, @@ -114,13 +117,48 @@ enum scx_ops_flags { * depend on pid lookups and wants to handle these tasks directly, the * following flag can be used. */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, + SCX_OPS_ENQ_EXITING = 1LLU << 2, /* * If set, only tasks with policy set to SCHED_EXT are attached to * sched_ext. If clear, SCHED_NORMAL tasks are also included. */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). + */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. + * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. 
+ */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, /* * CPU cgroup support flags @@ -130,7 +168,10 @@ enum scx_ops_flags { SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -416,7 +457,7 @@ struct sched_ext_ops { /** * @update_idle: Update the idle state of a CPU - * @cpu: CPU to udpate the idle state for + * @cpu: CPU to update the idle state for * @idle: whether entering or exiting the idle state * * This operation is called when @rq's CPU goes or leaves the idle @@ -765,6 +806,7 @@ enum scx_deq_flags { enum scx_pick_idle_cpu_flags { SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ }; enum scx_kick_flags { @@ -880,15 +922,11 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all); static struct sched_ext_ops scx_ops; static bool scx_warned_zero_slice; +DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); - -#ifdef CONFIG_SMP -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); -#endif static struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; @@ -923,21 +961,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* idle tracking */ -#ifdef CONFIG_SMP -#ifdef CONFIG_CPUMASK_OFFSTACK -#define CL_ALIGNED_IF_ONSTACK -#else -#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -#endif - -static struct { - cpumask_var_t cpu; - cpumask_var_t smt; -} idle_masks CL_ALIGNED_IF_ONSTACK; - -#endif /* CONFIG_SMP */ - /* for %SCX_KICK_WAIT */ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; @@ -1214,7 +1237,7 @@ static bool scx_kf_allowed_if_unlocked(void) /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being interated + * @dsq: user dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -1458,6 +1481,117 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return p; } +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. 
+	 */
+	s64		SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+	/*
+	 * The total number of tasks enqueued (or pick_task-ed) with a
+	 * default time slice (SCX_SLICE_DFL).
+	 */
+	s64		SCX_EV_ENQ_SLICE_DFL;
+
+	/*
+	 * The total duration of bypass modes in nanoseconds.
+	 */
+	s64		SCX_EV_BYPASS_DURATION;
+
+	/*
+	 * The number of tasks dispatched in the bypassing mode.
+	 */
+	s64		SCX_EV_BYPASS_DISPATCH;
+
+	/*
+	 * The number of times the bypassing mode has been activated.
+	 */
+	s64		SCX_EV_BYPASS_ACTIVATE;
+};
+
+/*
+ * The event counter is organized by a per-CPU variable to minimize the
+ * accounting overhead without synchronization. A system-wide view on the
+ * event counter is constructed when requested by scx_bpf_get_event_stat().
+ */
+static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of times the event occurred
+ *
+ * This can be used when preemption is not disabled.
+ */
+#define scx_add_event(name, cnt) do {					\
+	this_cpu_add(event_stats_cpu.name, cnt);			\
+	trace_sched_ext_event(#name, cnt);				\
+} while(0)
+
+/**
+ * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of times the event occurred
+ *
+ * This should be used only when preemption is disabled.
+ */
+#define __scx_add_event(name, cnt) do {					\
+	__this_cpu_add(event_stats_cpu.name, cnt);			\
+	trace_sched_ext_event(#name, cnt);				\
+} while(0)
+
+/**
+ * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
+ * @dst_e: destination event stats
+ * @src_e: source event stats
+ * @kind: a kind of event to be aggregated
+ */
+#define scx_agg_event(dst_e, src_e, kind) do {				\
+	(dst_e)->kind += READ_ONCE((src_e)->kind);			\
+} while(0)
+
+/**
+ * scx_dump_event - Dump an event 'kind' in 'events' to 's'
+ * @s: output seq_buf
+ * @events: event stats
+ * @kind: a kind of event to dump
+ */
+#define scx_dump_event(s, events, kind) do {				\
+	dump_line(&(s), "%40s: %16lld", #kind, (events)->kind);	\
+} while (0)
+
+
+static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+
 static enum scx_ops_enable_state scx_ops_enable_state(void)
 {
 	return atomic_read(&scx_ops_enable_state_var);
@@ -2003,16 +2137,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	if (!scx_rq_online(rq))
 		goto local;
 
-	if (scx_rq_bypassing(rq))
+	if (scx_rq_bypassing(rq)) {
+		__scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
 		goto global;
+	}
 
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
 		goto direct;
 
 	/* see %SCX_OPS_ENQ_EXITING */
 	if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
-	    unlikely(p->flags & PF_EXITING))
+	    unlikely(p->flags & PF_EXITING)) {
+		__scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+		goto local;
+	}
+
+	/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+	if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+	    is_migration_disabled(p)) {
+		__scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
 		goto local;
+	}
 
 	if (!SCX_HAS_OP(enqueue))
 		goto global;
@@ -2052,6 +2197,7 @@ local:
 	 */
 	touch_core_sched(rq, p);
 	p->scx.slice = SCX_SLICE_DFL;
+	__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 local_norefill:
 	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
 	return;
@@ -2059,6 +2205,7 @@ local_norefill:
 global:
 	touch_core_sched(rq, p);	/* see the comment in local: */
 	p->scx.slice = SCX_SLICE_DFL;
+	__scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
 	dispatch_enqueue(find_global_dsq(p),
					 p, enq_flags);
 }
 
@@ -2078,7 +2225,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
 
 	/*
 	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
-	 * appened to the runnable_list.
+	 * appended to the runnable_list.
 	 */
 	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
 }
@@ -2130,6 +2277,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
 out:
 	rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
+
+	if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
+	    unlikely(cpu_of(rq) != p->scx.selected_cpu))
+		__scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
 }
 
 static void ops_dequeue(struct task_struct *p, u64 deq_flags)
@@ -2313,12 +2464,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
  *
  * - The BPF scheduler is bypassed while the rq is offline and we can always say
  *   no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
  */
 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
-				      bool trigger_error)
+				      bool enforce)
 {
 	int cpu = cpu_of(rq);
 
+	WARN_ON_ONCE(task_cpu(p) == cpu);
+
+	/*
+	 * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+	 * the pinned CPU in migrate_disable_switch() while @p is being switched
+	 * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+	 * updated and thus another CPU may see @p on a DSQ in between, leading to
+	 * @p passing the below task_allowed_on_cpu() check while migration is
+	 * disabled.
+	 *
+	 * Test the migration disabled state first as the race window is narrow
+	 * and the BPF scheduler failing to check migration disabled state can
+	 * easily be masked if task_allowed_on_cpu() is done first.
+	 */
+	if (unlikely(is_migration_disabled(p))) {
+		if (enforce)
+			scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+				      p->comm, p->pid, task_cpu(p), cpu);
+		return false;
+	}
+
 	/*
 	 * We don't require the BPF scheduler to avoid dispatching to offline
 	 * CPUs mostly for convenience but also because CPUs can go offline
@@ -2326,17 +2500,17 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
 	 * picked CPU is outside the allowed mask.
*/ if (!task_allowed_on_cpu(p, cpu)) { - if (trigger_error) - scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", - cpu_of(rq), p->comm, p->pid); + if (enforce) + scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); return false; } - if (unlikely(is_migration_disabled(p))) - return false; - - if (!scx_rq_online(rq)) + if (!scx_rq_online(rq)) { + if (enforce) + __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; + } return true; } @@ -2406,7 +2580,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } #else /* CONFIG_SMP */ static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } +static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ @@ -2437,7 +2611,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); - if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dst_dsq = find_global_dsq(p); dst_rq = src_rq; } @@ -2480,7 +2655,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, /* * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artifical delays while the + * to the bypass mode can take a long time. Inject artificial delays while the * bypass mode is switching to guarantee timely completion. */ static void scx_ops_breather(struct rq *rq) @@ -2575,6 +2750,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, { struct rq *src_rq = task_rq(p); struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); +#ifdef CONFIG_SMP + struct rq *locked_rq = rq; +#endif /* * We're synchronized against dequeue through DISPATCHING. 
As @p can't @@ -2588,7 +2766,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } #ifdef CONFIG_SMP - if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dispatch_enqueue(find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; @@ -2611,8 +2790,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); /* switch to @src_rq lock */ - if (rq != src_rq) { - raw_spin_rq_unlock(rq); + if (locked_rq != src_rq) { + raw_spin_rq_unlock(locked_rq); + locked_rq = src_rq; raw_spin_rq_lock(src_rq); } @@ -2630,6 +2810,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } else { move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq); + /* task has been moved to dst_rq, which is now locked */ + locked_rq = dst_rq; } /* if the destination CPU is idle, wake it up */ @@ -2638,8 +2820,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } /* switch back to @rq lock */ - if (rq != dst_rq) { - raw_spin_rq_unlock(dst_rq); + if (locked_rq != rq) { + raw_spin_rq_unlock(locked_rq); raw_spin_rq_lock(rq); } #else /* CONFIG_SMP */ @@ -2845,6 +3027,7 @@ no_tasks: if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; + __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; } rq->scx.flags &= ~SCX_RQ_IN_BALANCE; @@ -3069,7 +3252,6 @@ static struct task_struct *pick_task_scx(struct rq *rq) { struct task_struct *prev = rq->curr; struct task_struct *p; - bool prev_on_scx = prev->sched_class == &ext_sched_class; bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; bool kick_idle = false; @@ -3089,14 +3271,18 @@ static struct task_struct *pick_task_scx(struct rq *rq) * if pick_task_scx() is called without preceding balance_scx(). */ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev_on_scx) { + if (prev->scx.flags & SCX_TASK_QUEUED) { keep_prev = true; } else { keep_prev = false; kick_idle = true; } - } else if (unlikely(keep_prev && !prev_on_scx)) { - /* only allowed during transitions */ + } else if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { + /* + * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is + * conditional on scx_enabled() and may have been skipped. + */ WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); keep_prev = false; } @@ -3108,8 +3294,10 @@ static struct task_struct *pick_task_scx(struct rq *rq) */ if (keep_prev) { p = prev; - if (!p->scx.slice) + if (!p->scx.slice) { p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + } } else { p = first_local_task(rq); if (!p) { @@ -3125,6 +3313,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); } } @@ -3144,7 +3333,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used * to implement the default task ordering. The older the timestamp, the higher - * prority the task - the global FIFO ordering matching the default scheduling + * priority the task - the global FIFO ordering matching the default scheduling * behavior. 
* * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to @@ -3169,418 +3358,10 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, #ifdef CONFIG_SMP -static bool test_and_clear_cpu_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - /* - * SMT mask should be cleared whether we can claim @cpu or not. The SMT - * cluster is not wholly idle either way. This also prevents - * scx_pick_idle_cpu() from getting caught in an infinite loop. - */ - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - /* - * If offline, @cpu is not its own sibling and - * scx_pick_idle_cpu() can get caught in an infinite loop as - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu - * is eventually cleared. - * - * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to - * reduce memory writes, which may help alleviate cache - * coherence pressure. - */ - if (cpumask_intersects(smt, idle_masks.smt)) - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - else if (cpumask_test_cpu(cpu, idle_masks.smt)) - __cpumask_clear_cpu(cpu, idle_masks.smt); - } -#endif - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -} - -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -{ - int cpu; - -retry: - if (sched_smt_active()) { - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); - if (cpu < nr_cpu_ids) - goto found; - - if (flags & SCX_PICK_IDLE_CORE) - return -EBUSY; - } - - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); - if (cpu >= nr_cpu_ids) - return -EBUSY; - -found: - if (test_and_clear_cpu_idle(cpu)) - return cpu; - else - goto retry; -} - -/* - * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC - * domain is not defined). - */ -static unsigned int llc_weight(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sd->span_weight; -} - -/* - * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC - * domain is not defined). - */ -static struct cpumask *llc_span(s32 cpu) -{ - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return 0; - - return sched_domain_span(sd); -} - -/* - * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the - * NUMA domain is not defined). - */ -static unsigned int numa_weight(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return 0; - sg = sd->groups; - if (!sg) - return 0; - - return sg->group_weight; -} - -/* - * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA - * domain is not defined). - */ -static struct cpumask *numa_span(s32 cpu) -{ - struct sched_domain *sd; - struct sched_group *sg; - - sd = rcu_dereference(per_cpu(sd_numa, cpu)); - if (!sd) - return NULL; - sg = sd->groups; - if (!sg) - return NULL; - - return sched_group_span(sg); -} - -/* - * Return true if the LLC domains do not perfectly overlap with the NUMA - * domains, false otherwise. - */ -static bool llc_numa_mismatch(void) -{ - int cpu; - - /* - * We need to scan all online CPUs to verify whether their scheduling - * domains overlap. - * - * While it is rare to encounter architectures with asymmetric NUMA - * topologies, CPU hotplugging or virtualized environments can result - * in asymmetric configurations. 
- * - * For example: - * - * NUMA 0: - * - LLC 0: cpu0..cpu7 - * - LLC 1: cpu8..cpu15 [offline] - * - * NUMA 1: - * - LLC 0: cpu16..cpu23 - * - LLC 1: cpu24..cpu31 - * - * In this case, if we only check the first online CPU (cpu0), we might - * incorrectly assume that the LLC and NUMA domains are fully - * overlapping, which is incorrect (as NUMA 1 has two distinct LLC - * domains). - */ - for_each_online_cpu(cpu) - if (llc_weight(cpu) != numa_weight(cpu)) - return true; - - return false; -} - -/* - * Initialize topology-aware scheduling. - * - * Detect if the system has multiple LLC or multiple NUMA domains and enable - * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle - * selection policy. - * - * Assumption: the kernel's internal topology representation assumes that each - * CPU belongs to a single LLC domain, and that each LLC domain is entirely - * contained within a single NUMA node. - */ -static void update_selcpu_topology(void) -{ - bool enable_llc = false, enable_numa = false; - unsigned int nr_cpus; - s32 cpu = cpumask_first(cpu_online_mask); - - /* - * Enable LLC domain optimization only when there are multiple LLC - * domains among the online CPUs. If all online CPUs are part of a - * single LLC domain, the idle CPU selection logic can choose any - * online CPU without bias. - * - * Note that it is sufficient to check the LLC domain of the first - * online CPU to determine whether a single LLC domain includes all - * CPUs. - */ - rcu_read_lock(); - nr_cpus = llc_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus()) - enable_llc = true; - pr_debug("sched_ext: LLC=%*pb weight=%u\n", - cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); - } - - /* - * Enable NUMA optimization only when there are multiple NUMA domains - * among the online CPUs and the NUMA domains don't perfectly overlaps - * with the LLC domains. - * - * If all CPUs belong to the same NUMA node and the same LLC domain, - * enabling both NUMA and LLC optimizations is unnecessary, as checking - * for an idle CPU in the same domain twice is redundant. - */ - nr_cpus = numa_weight(cpu); - if (nr_cpus > 0) { - if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) - enable_numa = true; - pr_debug("sched_ext: NUMA=%*pb weight=%u\n", - cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); - } - rcu_read_unlock(); - - pr_debug("sched_ext: LLC idle selection %s\n", - str_enabled_disabled(enable_llc)); - pr_debug("sched_ext: NUMA idle selection %s\n", - str_enabled_disabled(enable_numa)); - - if (enable_llc) - static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); - if (enable_numa) - static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); -} - -/* - * Built-in CPU idle selection policy: - * - * 1. Prioritize full-idle cores: - * - always prioritize CPUs from fully idle cores (both logical CPUs are - * idle) to avoid interference caused by SMT. - * - * 2. Reuse the same CPU: - * - prefer the last used CPU to take advantage of cached data (L1, L2) and - * branch prediction optimizations. - * - * 3. Pick a CPU within the same LLC (Last-Level Cache): - * - if the above conditions aren't met, pick a CPU that shares the same LLC - * to maintain cache locality. - * - * 4. Pick a CPU within the same NUMA node, if enabled: - * - choose a CPU from the same NUMA node to reduce memory access latency. - * - * 5. Pick any idle CPU usable by the task. 
- * - * Step 3 and 4 are performed only if the system has, respectively, multiple - * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and - * scx_selcpu_topo_numa). - * - * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because - * we never call ops.select_cpu() for them, see select_task_rq(). - */ -static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *found) -{ - const struct cpumask *llc_cpus = NULL; - const struct cpumask *numa_cpus = NULL; - s32 cpu; - - *found = false; - - /* - * This is necessary to protect llc_cpus. - */ - rcu_read_lock(); - - /* - * Determine the scheduling domain only if the task is allowed to run - * on all CPUs. - * - * This is done primarily for efficiency, as it avoids the overhead of - * updating a cpumask every time we need to select an idle CPU (which - * can be costly in large SMP systems), but it also aligns logically: - * if a task's scheduling domain is restricted by user-space (through - * CPU affinity), the task will simply use the flat scheduling domain - * defined by user-space. - */ - if (p->nr_cpus_allowed >= num_possible_cpus()) { - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) - numa_cpus = numa_span(prev_cpu); - - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) - llc_cpus = llc_span(prev_cpu); - } - - /* - * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. - */ - if (wake_flags & SCX_WAKE_SYNC) { - cpu = smp_processor_id(); - - /* - * If the waker's CPU is cache affine and prev_cpu is idle, - * then avoid a migration. - */ - if (cpus_share_cache(cpu, prev_cpu) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * If the waker's local DSQ is empty, and the system is under - * utilized, try to wake up @p to the local DSQ of the waker. - * - * Checking only for an empty local DSQ is insufficient as it - * could give the wakee an unfair advantage when the system is - * oversaturated. - * - * Checking only for the presence of idle CPUs is also - * insufficient as the local DSQ of the waker could have tasks - * piled up on it even if there is an idle core elsewhere on - * the system. - */ - if (!cpumask_empty(idle_masks.cpu) && - !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) - goto cpu_found; - } - } - - /* - * If CPU has SMT, any wholly idle CPU is likely a better pick than - * partially idle @prev_cpu. - */ - if (sched_smt_active()) { - /* - * Keep using @prev_cpu if it's part of a fully idle core. - */ - if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any fully idle core in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any fully idle core in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any full idle core usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Use @prev_cpu if it's idle. - */ - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any idle CPU in the same LLC domain. 
- */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - goto cpu_found; - - rcu_read_unlock(); - return prev_cpu; - -cpu_found: - rcu_read_unlock(); - - *found = true; - return cpu; -} - static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { + bool rq_bypass; + /* * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it * can be a good migration opportunity with low cache and memory @@ -3594,7 +3375,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { + rq_bypass = scx_rq_bypassing(task_rq(p)); + if (SCX_HAS_OP(select_cpu) && !rq_bypass) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3604,20 +3386,27 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, select_cpu, p, prev_cpu, wake_flags); + p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; if (ops_cpu_valid(cpu, "from ops.select_cpu()")) return cpu; else return prev_cpu; } else { - bool found; s32 cpu; - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); - if (found) { + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + if (cpu >= 0) { p->scx.slice = SCX_SLICE_DFL; p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + } else { + cpu = prev_cpu; } + p->scx.selected_cpu = cpu; + + if (rq_bypass) + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); return cpu; } } @@ -3645,90 +3434,6 @@ static void set_cpus_allowed_scx(struct task_struct *p, (struct cpumask *)p->cpus_ptr); } -static void reset_idle_masks(void) -{ - /* - * Consider all online cpus idle. Should converge to the actual state - * quickly. - */ - cpumask_copy(idle_masks.cpu, cpu_online_mask); - cpumask_copy(idle_masks.smt, cpu_online_mask); -} - -static void update_builtin_idle(int cpu, bool idle) -{ - assign_cpu(cpu, idle_masks.cpu, idle); - -#ifdef CONFIG_SCHED_SMT - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - if (idle) { - /* - * idle_masks.smt handling is racy but that's fine as - * it's only for optimization and self-correcting. - */ - if (!cpumask_subset(smt, idle_masks.cpu)) - return; - cpumask_or(idle_masks.smt, idle_masks.smt, smt); - } else { - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - } - } -#endif -} - -/* - * Update the idle state of a CPU to @idle. - * - * If @do_notify is true, ops.update_idle() is invoked to notify the scx - * scheduler of an actual idle state transition (idle to busy or vice - * versa). If @do_notify is false, only the idle state in the idle masks is - * refreshed without invoking ops.update_idle(). - * - * This distinction is necessary, because an idle CPU can be "reserved" and - * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as - * busy even if no tasks are dispatched. In this case, the CPU may return - * to idle without a true state transition. Refreshing the idle masks - * without invoking ops.update_idle() ensures accurate idle state tracking - * while avoiding unnecessary updates and maintaining balanced state - * transitions. 
- */ -void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) -{ - int cpu = cpu_of(rq); - - lockdep_assert_rq_held(rq); - - /* - * Trigger ops.update_idle() only when transitioning from a task to - * the idle thread and vice versa. - * - * Idle transitions are indicated by do_notify being set to true, - * managed by put_prev_task_idle()/set_next_task_idle(). - */ - if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - - /* - * Update the idle masks: - * - for real idle transitions (do_notify == true) - * - for idle-to-idle transitions (indicated by the previous task - * being the idle thread, managed by pick_task_idle()) - * - * Skip updating idle masks if the previous task is not the idle - * thread, since set_next_task_idle() has already handled it when - * transitioning from a task to the idle thread (calling this - * function with do_notify == true). - * - * In this way we can avoid updating the idle masks twice, - * unnecessarily. - */ - if (static_branch_likely(&scx_builtin_idle_enabled)) - if (do_notify || is_idle_task(rq->curr)) - update_builtin_idle(cpu, idle); -} - static void handle_hotplug(struct rq *rq, bool online) { int cpu = cpu_of(rq); @@ -3736,7 +3441,7 @@ static void handle_hotplug(struct rq *rq, bool online) atomic_long_inc(&scx_hotplug_seq); if (scx_enabled()) - update_selcpu_topology(); + scx_idle_update_selcpu_topology(&scx_ops); if (online && SCX_HAS_OP(cpu_online)) SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); @@ -3768,12 +3473,6 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } -#else /* CONFIG_SMP */ - -static bool test_and_clear_cpu_idle(int cpu) { return false; } -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -static void reset_idle_masks(void) {} - #endif /* CONFIG_SMP */ static bool check_rq_for_timeouts(struct rq *rq) @@ -3851,7 +3550,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP(SCX_KF_REST, tick, curr); + SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); } if (!curr->scx.slice) @@ -3998,7 +3697,7 @@ static void scx_ops_disable_task(struct task_struct *p) WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); if (SCX_HAS_OP(disable)) - SCX_CALL_OP(SCX_KF_REST, disable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); scx_set_task_state(p, SCX_TASK_READY); } @@ -4027,7 +3726,7 @@ static void scx_ops_exit_task(struct task_struct *p) } if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); + SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); scx_set_task_state(p, SCX_TASK_NONE); } @@ -4323,25 +4022,12 @@ err: return ops_sanitize_err("cgroup_prep_move", ret); } -void scx_move_task(struct task_struct *p) +void scx_cgroup_move_task(struct task_struct *p) { if (!scx_cgroup_enabled) return; /* - * We're called from sched_move_task() which handles both cgroup and - * autogroup moves. Ignore the latter. - * - * Also ignore exiting tasks, because in the exit path tasks transition - * from the autogroup to the root group, so task_group_is_autogroup() - * alone isn't able to catch exiting autogroup tasks. This is safe for - * cgroup_move(), because cgroup migrations never happen for PF_EXITING - * tasks. 
- */ - if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) - return; - - /* * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. */ @@ -4590,7 +4276,7 @@ static int scx_cgroup_init(void) cgroup_warned_missing_idle = false; /* - * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. */ rcu_read_lock(); @@ -4711,8 +4397,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj, } SCX_ATTR(ops); +#define scx_attr_event_show(buf, at, events, kind) ({ \ + sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ +}) + +static ssize_t scx_attr_events_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct scx_event_stats events; + int at = 0; + + scx_bpf_events(&events, sizeof(events)); + at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); + return at; +} +SCX_ATTR(events); + static struct attribute *scx_sched_attrs[] = { &scx_attr_ops.attr, + &scx_attr_events.attr, NULL, }; ATTRIBUTE_GROUPS(scx_sched); @@ -4824,6 +4535,8 @@ static void scx_clear_softlockup(void) static void scx_ops_bypass(bool bypass) { static DEFINE_RAW_SPINLOCK(bypass_lock); + static unsigned long bypass_timestamp; + int cpu; unsigned long flags; @@ -4833,11 +4546,15 @@ static void scx_ops_bypass(bool bypass) WARN_ON_ONCE(scx_ops_bypass_depth <= 0); if (scx_ops_bypass_depth != 1) goto unlock; + bypass_timestamp = ktime_get_ns(); + scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); } else { scx_ops_bypass_depth--; WARN_ON_ONCE(scx_ops_bypass_depth < 0); if (scx_ops_bypass_depth != 0) goto unlock; + scx_add_event(SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); } atomic_inc(&scx_ops_breather_depth); @@ -5057,10 +4774,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work) static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_allow_queued_wakeup); static_branch_disable(&scx_ops_enq_last); static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_enq_migration_disabled); static_branch_disable(&scx_ops_cpu_preempt); - static_branch_disable(&scx_builtin_idle_enabled); + scx_idle_disable(); synchronize_rcu(); if (ei->kind >= SCX_EXIT_ERROR) { @@ -5277,9 +4996,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, - p->scx.dsq_vtime, p->scx.slice); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " 
dsq_vtime=%llu slice=%llu weight=%u", + p->scx.dsq_vtime, p->scx.slice, p->scx.weight); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { @@ -5309,6 +5029,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) .at_jiffies = jiffies, }; struct seq_buf s; + struct scx_event_stats events; unsigned long flags; char *buf; int cpu; @@ -5417,6 +5138,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) rq_unlock(rq, &rf); } + dump_newline(&s); + dump_line(&s, "Event counters"); + dump_line(&s, "--------------"); + + scx_bpf_events(&events, sizeof(events)); + scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); + scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL); + scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); + scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); + scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), trunc_marker, sizeof(trunc_marker)); @@ -5506,6 +5242,16 @@ static int validate_ops(const struct sched_ext_ops *ops) return -EINVAL; } + /* + * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle + * selection policy to be enabled. + */ + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && + (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); + return -EINVAL; + } + return 0; } @@ -5524,6 +5270,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_ops_enable_mutex); + /* + * Clear event counters so a new scx scheduler gets + * fresh event counter values. 
+ */ + for_each_possible_cpu(cpu) { + struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu); + memset(e, 0, sizeof(*e)); + } + if (!scx_ops_helper) { WRITE_ONCE(scx_ops_helper, scx_create_rt_helper("sched_ext_ops_helper")); @@ -5621,9 +5376,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable_cpuslocked(&scx_has_op[i]); check_hotplug_seq(ops); -#ifdef CONFIG_SMP - update_selcpu_topology(); -#endif + scx_idle_update_selcpu_topology(ops); + cpus_read_unlock(); ret = validate_ops(ops); @@ -5662,20 +5416,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable(&scx_has_op[i]); + if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + static_branch_enable(&scx_ops_allow_queued_wakeup); if (ops->flags & SCX_OPS_ENQ_LAST) static_branch_enable(&scx_ops_enq_last); - if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable(&scx_ops_enq_exiting); + if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) + static_branch_enable(&scx_ops_enq_migration_disabled); if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); - if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { - reset_idle_masks(); - static_branch_enable(&scx_builtin_idle_enabled); - } else { - static_branch_disable(&scx_builtin_idle_enabled); - } + scx_idle_enable(ops); /* * Lock out forks, cgroup on/offlining and moves before opening the @@ -6314,10 +6066,8 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -#ifdef CONFIG_SMP - BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -#endif + scx_idle_init_masks(); + scx_kick_cpus_pnt_seqs = __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, __alignof__(scx_kick_cpus_pnt_seqs[0])); @@ -6325,15 +6075,16 @@ void __init init_sched_ext_class(void) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); @@ -6350,62 +6101,6 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. 
*/ -#include <linux/btf_ids.h> - -__bpf_kfunc_start_defs(); - -static bool check_builtin_idle_enabled(void) -{ - if (static_branch_likely(&scx_builtin_idle_enabled)) - return true; - - scx_ops_error("built-in idle tracking is disabled"); - return false; -} - -/** - * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() - * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * @wake_flags: %SCX_WAKE_* flags - * @is_idle: out parameter indicating whether the returned CPU is idle - * - * Can only be called from ops.select_cpu() if the built-in CPU selection is - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). - * - * Returns the picked CPU with *@is_idle indicating whether the picked CPU is - * currently idle and thus a good candidate for direct dispatching. - */ -__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *is_idle) -{ - if (!check_builtin_idle_enabled()) - goto prev_cpu; - - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) - goto prev_cpu; - -#ifdef CONFIG_SMP - return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#endif - -prev_cpu: - *is_idle = false; - return prev_cpu; -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) - -static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { - .owner = THIS_MODULE, - .set = &scx_kfunc_ids_select_cpu, -}; - static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) @@ -6927,8 +6622,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. + * + * Also skip re-enqueueing tasks that can only run on this + * CPU, as they would just be re-added to the same local + * DSQ without any benefit. */ - if (p->migration_pending) + if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) continue; dispatch_dequeue(rq, p); @@ -7425,6 +7124,16 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) } /** + * scx_bpf_nr_node_ids - Return the number of possible node IDs + * + * All valid node IDs in the system are smaller than the returned value. + */ +__bpf_kfunc u32 scx_bpf_nr_node_ids(void) +{ + return nr_node_ids; +} + +/** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * * All valid CPU IDs in the system are smaller than the returned value. @@ -7465,142 +7174,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) } /** - * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking - * per-CPU cpumask. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, - * per-physical-core cpumask. Can be used to determine if an entire physical - * core is free. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. 
- */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) -{ - if (!check_builtin_idle_enabled()) - return cpu_none_mask; - -#ifdef CONFIG_SMP - if (sched_smt_active()) - return idle_masks.smt; - else - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to - * either the percpu, or SMT idle-tracking cpumask. - * @idle_mask: &cpumask to use - */ -__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -{ - /* - * Empty function body because we aren't actually acquiring or releasing - * a reference to a global idle cpumask, which is read-only in the - * caller and is never released. The acquire / release semantics here - * are just used to make the cpumask a trusted pointer in the caller. - */ -} - -/** - * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state - * @cpu: cpu to test and clear idle for - * - * Returns %true if @cpu was idle and its idle state was successfully cleared. - * %false otherwise. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -{ - if (!check_builtin_idle_enabled()) - return false; - - if (ops_cpu_valid(cpu, NULL)) - return test_and_clear_cpu_idle(cpu); - else - return false; -} - -/** - * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu - * number on success. -%EBUSY if no matching cpu was found. - * - * Idle CPU tracking may race against CPU scheduling state transitions. For - * example, this function may return -%EBUSY as CPUs are transitioning into the - * idle state. If the caller then assumes that there will be dispatch events on - * the CPUs as they were all busy, the scheduler may end up stalling with CPUs - * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and - * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch - * event in the near future. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - if (!check_builtin_idle_enabled()) - return -EBUSY; - - return scx_pick_idle_cpu(cpus_allowed, flags); -} - -/** - * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any - * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu - * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is - * empty. - * - * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not - * set, this function can't tell which CPUs are idle and will always pick any - * CPU. - */ -__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - s32 cpu; - - if (static_branch_likely(&scx_builtin_idle_enabled)) { - cpu = scx_pick_idle_cpu(cpus_allowed, flags); - if (cpu >= 0) - return cpu; - } - - cpu = cpumask_any_distribute(cpus_allowed); - if (cpu < nr_cpu_ids) - return cpu; - else - return -EBUSY; -} - -/** * scx_bpf_task_running - Is task currently running? 
* @p: task of interest */ @@ -7720,6 +7293,43 @@ __bpf_kfunc u64 scx_bpf_now(void) return clock; } +/* + * scx_bpf_events - Copy the system-wide event counters to @events + * @events: output buffer from a BPF program + * @events__sz: @events len, must end in '__sz' for the verifier + */ +__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, + size_t events__sz) +{ + struct scx_event_stats e_sys, *e_cpu; + int cpu; + + /* Aggregate per-CPU event counters into the system-wide counters. */ + memset(&e_sys, 0, sizeof(e_sys)); + for_each_possible_cpu(cpu) { + e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); + scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); + } + + /* + * We cannot entirely trust a BPF-provided size since a BPF program + * might be compiled against a different vmlinux.h, of which + * scx_event_stats would be larger (a newer vmlinux.h) or smaller + * (an older vmlinux.h). Hence, we use the smaller size to avoid + * memory corruption. + */ + events__sz = min(events__sz, sizeof(*events)); + memcpy(events, &e_sys, events__sz); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -7735,6 +7345,7 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) @@ -7752,6 +7363,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq) BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif BTF_ID_FLAGS(func, scx_bpf_now) +BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { @@ -7775,8 +7387,6 @@ static int __init scx_init(void) * check using scx_kf_allowed().
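For illustration only, a BPF scheduler could read the aggregated counters through scx_bpf_events() roughly as sketched below; the struct and field names come from the hunk above, while the helper name and the use of bpf_printk() are the sketch's own assumptions, not part of this patch:

	/* Hedged sketch: dump a few scx event counters from a BPF program. */
	static void sketch_dump_events(void)
	{
		struct scx_event_stats events;

		/* The kernel copies at most min(events__sz, its own struct size). */
		scx_bpf_events(&events, sizeof(events));

		bpf_printk("select_cpu_fallback=%llu", events.SCX_EV_SELECT_CPU_FALLBACK);
		bpf_printk("enq_slice_dfl=%llu", events.SCX_EV_ENQ_SLICE_DFL);
		bpf_printk("bypass_activate=%llu", events.SCX_EV_BYPASS_ACTIVATE);
	}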
*/ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, - &scx_kfunc_set_select_cpu)) || - (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || @@ -7796,6 +7406,12 @@ static int __init scx_init(void) return ret; } + ret = scx_idle_init(); + if (ret) { + pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); + return ret; + } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); if (ret) { pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4d022d17ac7d..1bda96b19a1b 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,6 +8,8 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT +DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); + void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) +{ + return !scx_enabled() || + static_branch_likely(&scx_ops_allow_queued_wakeup) || + p->sched_class != &ext_sched_class; +} + #ifdef CONFIG_SCHED_CORE bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi); @@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {} static inline void scx_rq_deactivate(struct rq *rq) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; } static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -73,7 +83,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); -void scx_move_task(struct task_struct *p); +void scx_cgroup_move_task(struct task_struct *p); void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); @@ -82,7 +92,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle); static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_move_task(struct task_struct *p) {} static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c new file mode 100644 index 000000000000..52c36a70a3d0 --- /dev/null +++ b/kernel/sched/ext_idle.c @@ -0,0 +1,1171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Built-in idle CPU tracking policy. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#include "ext_idle.h" + +/* Enable/disable built-in idle CPU selection policy */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +/* Enable/disable per-node idle cpumasks */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP +/* Enable/disable LLC aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); + +/* Enable/disable NUMA aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); + +/* + * cpumasks to track idle CPUs within each NUMA node. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask + * is used to track all the idle CPUs in the system. + */ +struct scx_idle_cpus { + cpumask_var_t cpu; + cpumask_var_t smt; +}; + +/* + * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE + * is not enabled). + */ +static struct scx_idle_cpus scx_idle_global_masks; + +/* + * Per-node idle cpumasks. + */ +static struct scx_idle_cpus **scx_idle_node_masks; + +/* + * Return the idle masks associated with a target @node. + * + * NUMA_NO_NODE identifies the global idle cpumask. + */ +static struct scx_idle_cpus *idle_cpumask(int node) +{ + return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node]; +} + +/* + * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if + * per-node idle cpumasks are disabled. + */ +static int scx_cpu_node_if_enabled(int cpu) +{ + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +} + +bool scx_idle_test_and_clear_cpu(int cpu) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from the idle SMT mask. Ensure that + * @cpu is eventually cleared. + * + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to + * reduce memory writes, which may help alleviate cache + * coherence pressure. + */ + if (cpumask_intersects(smt, idle_smts)) + cpumask_andnot(idle_smts, idle_smts, smt); + else if (cpumask_test_cpu(cpu, idle_smts)) + __cpumask_clear_cpu(cpu, idle_smts); + } +#endif + + return cpumask_test_and_clear_cpu(cpu, idle_cpus); +} + +/* + * Pick an idle CPU in a specific NUMA node. + */ +static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (scx_idle_test_and_clear_cpu(cpu)) + return cpu; + else + goto retry; +} + +/* + * Tracks nodes that have not yet been visited when searching for an idle + * CPU across all available nodes.
+ */ +static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited); + +/* + * Search for an idle CPU across all nodes, excluding @node. + */ +static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + nodemask_t *unvisited; + s32 cpu = -EBUSY; + + preempt_disable(); + unvisited = this_cpu_ptr(&per_cpu_unvisited); + + /* + * Restrict the search to the online nodes (excluding the current + * node that has been visited already). + */ + nodes_copy(*unvisited, node_states[N_ONLINE]); + node_clear(node, *unvisited); + + /* + * Traverse all nodes in order of increasing distance, starting + * from @node. + * + * This loop is O(N^2), with N being the amount of NUMA nodes, + * which might be quite expensive in large NUMA systems. However, + * this complexity comes into play only when a scheduler enables + * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU + * without specifying a target NUMA node, so it shouldn't be a + * bottleneck in most cases. + * + * As a future optimization we may want to cache the list of nodes + * in a per-node array, instead of actually traversing them every + * time. + */ + for_each_node_numadist(node, *unvisited) { + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + break; + } + preempt_enable(); + + return cpu; +} + +/* + * Find an idle CPU in the system, starting from @node. + */ +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + s32 cpu; + + /* + * Always search in the starting node first (this is an + * optimization that can save some cycles even when the search is + * not limited to a single node). + */ + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + /* + * Stop the search if we are using only a single global cpumask + * (NUMA_NO_NODE) or if the search is restricted to the first node + * only. + */ + if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE) + return -EBUSY; + + /* + * Extend the search to the other online nodes. + */ + return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags); +} + +/* + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC + * domain is not defined). + */ +static unsigned int llc_weight(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sd->span_weight; +} + +/* + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC + * domain is not defined). + */ +static struct cpumask *llc_span(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sched_domain_span(sd); +} + +/* + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the + * NUMA domain is not defined). + */ +static unsigned int numa_weight(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return 0; + sg = sd->groups; + if (!sg) + return 0; + + return sg->group_weight; +} + +/* + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA + * domain is not defined).
+ */ +static struct cpumask *numa_span(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return NULL; + sg = sd->groups; + if (!sg) + return NULL; + + return sched_group_span(sg); +} + +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) + if (llc_weight(cpu) != numa_weight(cpu)) + return true; + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) +{ + bool enable_llc = false, enable_numa = false; + unsigned int nr_cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + nr_cpus = llc_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus()) + enable_llc = true; + pr_debug("sched_ext: LLC=%*pb weight=%u\n", + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlap + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, ignore the NUMA + * optimization, as we would naturally select idle CPUs within + * specific NUMA nodes by querying the corresponding per-node cpumask.
+ */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), nr_cpus); + } + } + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + str_enabled_disabled(enable_llc)); + pr_debug("sched_ext: NUMA idle selection %s\n", + str_enabled_disabled(enable_numa)); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same LLC + * to maintain cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node to reduce memory access latency. + * + * 5. Pick any idle CPU usable by the task. + * + * Step 3 and 4 are performed only if the system has, respectively, + * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs. + * + * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always + * begin in @prev_cpu's node and proceed to other nodes in order of + * increasing distance. + * + * Return the picked CPU if idle, or a negative value otherwise. + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags) +{ + const struct cpumask *llc_cpus = NULL; + const struct cpumask *numa_cpus = NULL; + int node = scx_cpu_node_if_enabled(prev_cpu); + s32 cpu; + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the scheduling domain only if the task is allowed to run + * on all CPUs. + * + * This is done primarily for efficiency, as it avoids the overhead of + * updating a cpumask every time we need to select an idle CPU (which + * can be costly in large SMP systems), but it also aligns logically: + * if a task's scheduling domain is restricted by user-space (through + * CPU affinity), the task will simply use the flat scheduling domain + * defined by user-space. + */ + if (p->nr_cpus_allowed >= num_possible_cpus()) { + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = numa_span(prev_cpu); + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) + llc_cpus = llc_span(prev_cpu); + } + + /* + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. + */ + if (wake_flags & SCX_WAKE_SYNC) { + int waker_node; + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. 
+ */ + cpu = smp_processor_id(); + if (cpus_share_cache(cpu, prev_cpu) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + waker_node = cpu_to_node(cpu); + if (!(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0 && + (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && + !cpumask_empty(idle_cpumask(waker_node)->cpu)) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto out_unlock; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ + if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any full-idle core usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always + * begin in prev_cpu's node and proceed to other nodes in + * order of increasing distance. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + + /* + * Give up if we're strictly looking for a full-idle SMT + * core. + */ + if (flags & SCX_PICK_IDLE_CORE) { + cpu = prev_cpu; + goto out_unlock; + } + } + + /* + * Use @prev_cpu if it's idle. + */ + if (scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin + * in prev_cpu's node and proceed to other nodes in order of + * increasing distance. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); + if (cpu >= 0) + goto out_unlock; + +out_unlock: + rcu_read_unlock(); + + return cpu; +} + +/* + * Initialize global and per-node idle cpumasks. 
+ */ +void scx_idle_init_masks(void) +{ + int node; + + /* Allocate global idle cpumasks */ + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); + + /* Allocate per-node idle cpumasks */ + scx_idle_node_masks = kcalloc(num_possible_nodes(), + sizeof(*scx_idle_node_masks), GFP_KERNEL); + BUG_ON(!scx_idle_node_masks); + + for_each_node(node) { + scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks), + GFP_KERNEL, node); + BUG_ON(!scx_idle_node_masks[node]); + + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); + } +} + +static void update_builtin_idle(int cpu, bool idle) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + + assign_cpu(cpu, idle_cpus, idle); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + if (idle) { + /* + * idle_smt handling is racy but that's fine as it's + * only for optimization and self-correcting. + */ + if (!cpumask_subset(smt, idle_cpus)) + return; + cpumask_or(idle_smts, idle_smts, smt); + } else { + cpumask_andnot(idle_smts, idle_smts, smt); + } + } +#endif +} + +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + */ + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); +} + +static void reset_idle_masks(struct sched_ext_ops *ops) +{ + int node; + + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. 
+ */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); + return; + } + + for_each_node(node) { + const struct cpumask *node_mask = cpumask_of_node(node); + + cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); + cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); + } +} +#endif /* CONFIG_SMP */ + +void scx_idle_enable(struct sched_ext_ops *ops) +{ + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) + static_branch_enable(&scx_builtin_idle_enabled); + else + static_branch_disable(&scx_builtin_idle_enabled); + + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) + static_branch_enable(&scx_builtin_idle_per_node); + else + static_branch_disable(&scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP + reset_idle_masks(ops); +#endif +} + +void scx_idle_disable(void) +{ + static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_per_node); +} + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ + +static int validate_node(int node) +{ + if (!static_branch_likely(&scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is disabled"); + return -EOPNOTSUPP; + } + + /* Return no entry for NUMA_NO_NODE (not a critical scx error) */ + if (node == NUMA_NO_NODE) + return -ENOENT; + + /* Make sure node is in a valid range */ + if (node < 0 || node >= nr_node_ids) { + scx_ops_error("invalid node %d", node); + return -EINVAL; + } + + /* Make sure the node is part of the set of possible nodes */ + if (!node_possible(node)) { + scx_ops_error("unavailable node %d", node); + return -EINVAL; + } + + return node; +} + +__bpf_kfunc_start_defs(); + +static bool check_builtin_idle_enabled(void) +{ + if (static_branch_likely(&scx_builtin_idle_enabled)) + return true; + + scx_ops_error("built-in idle tracking is disabled"); + return false; +} + +/** + * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or + * trigger an error if @cpu is invalid + * @cpu: target CPU + */ +__bpf_kfunc int scx_bpf_cpu_node(s32 cpu) +{ +#ifdef CONFIG_NUMA + if (!ops_cpu_valid(cpu, NULL)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +#else + return 0; +#endif +} + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can only be called from ops.select_cpu() if the built-in CPU selection is + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. 
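As a rough usage sketch (not part of this patch), an ops.select_cpu() implementation built on this kfunc could look as follows; the BPF_STRUCT_OPS macro and the scx_bpf_dsq_insert() dispatch helper are assumed from the sched_ext example schedulers rather than defined here:

	s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		bool is_idle = false;
		s32 cpu;

		/* Let the built-in policy pick; is_idle tells us if it claimed an idle CPU. */
		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
		if (is_idle)
			/* Idle CPU claimed: dispatch straight to its local DSQ. */
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

		return cpu;
	}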
+ */ +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *is_idle) +{ +#ifdef CONFIG_SMP + s32 cpu; +#endif + if (!ops_cpu_valid(prev_cpu, NULL)) + goto prev_cpu; + + if (!check_builtin_idle_enabled()) + goto prev_cpu; + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + +#ifdef CONFIG_SMP + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + if (cpu >= 0) { + *is_idle = true; + return cpu; + } +#endif + +prev_cpu: + *is_idle = false; + return prev_cpu; +} + +/** + * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the + * idle-tracking per-CPU cpumask of a target NUMA node. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the + * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be + * used to determine if an entire physical core is free. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(node)->smt; + else + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(NUMA_NO_NODE)->smt; + else + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. 
+ * @idle_mask: &cpumask to use + */ +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global idle cpumask, which is read-only in the + * caller and is never released. The acquire / release semantics here + * are just used to make the cpumask a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!check_builtin_idle_enabled()) + return false; + + if (ops_cpu_valid(cpu, NULL)) + return scx_idle_test_and_clear_cpu(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_* flags + * + * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. + * + * Returns the picked idle cpu number on success, or -%EBUSY if no matching + * cpu was found. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node). + * + * Always returns an error if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if + * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + node = validate_node(node); + if (node < 0) + return node; + + return scx_pick_idle_cpu(cpus_allowed, node, flags); +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_idle_cpu_node() instead. 
+ */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (!check_builtin_idle_enabled()) + return -EBUSY; + + return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); +} + +/** + * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available + * or pick any CPU from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node, regardless of + * the CPU idle state). + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + s32 cpu; + + node = validate_node(node); + if (node < 0) + return node; + + cpu = scx_pick_idle_cpu(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + if (flags & SCX_PICK_IDLE_IN_NODE) + cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed); + else + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_any_cpu_node() instead. 
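To show how the node-aware variants are meant to compose, here is a hedged sketch; the helper name is hypothetical and the calls assume a scheduler running with %SCX_OPS_BUILTIN_IDLE_PER_NODE set:

	/* Sketch: prefer an idle CPU on @prev_cpu's node, then widen the search. */
	static s32 sketch_pick_cpu(struct task_struct *p, s32 prev_cpu)
	{
		int node = scx_bpf_cpu_node(prev_cpu);
		s32 cpu;

		/* SCX_PICK_IDLE_IN_NODE keeps the search within @node. */
		cpu = scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
						 SCX_PICK_IDLE_IN_NODE);
		if (cpu >= 0)
			return cpu;

		/* Fall back: search other nodes by increasing distance, then any CPU. */
		return scx_bpf_pick_any_cpu_node(p->cpus_ptr, node, 0);
	}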
+ */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_idle) +BTF_ID_FLAGS(func, scx_bpf_cpu_node) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_idle) + +static const struct btf_kfunc_id_set scx_kfunc_set_idle = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_idle, +}; + +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) + +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_select_cpu, +}; + +int scx_idle_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); + + return ret; +} diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h new file mode 100644 index 000000000000..511cc2221f7a --- /dev/null +++ b/kernel/sched/ext_idle.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#ifndef _KERNEL_SCHED_EXT_IDLE_H +#define _KERNEL_SCHED_EXT_IDLE_H + +struct sched_ext_ops; + +#ifdef CONFIG_SMP +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); +void scx_idle_init_masks(void); +bool scx_idle_test_and_clear_cpu(int cpu); +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); +#else /* !CONFIG_SMP */ +static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} +static inline void scx_idle_init_masks(void) {} +static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } +static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif /* CONFIG_SMP */ + +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags); +void scx_idle_enable(struct sched_ext_ops *ops); +void scx_idle_disable(void); +int scx_idle_init(void); + +#endif /* _KERNEL_SCHED_EXT_IDLE_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ce2e94ccad0c..e43993a4e580 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -74,12 +74,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_base_slice = 750000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +unsigned int sysctl_sched_base_slice = 700000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; static int __init setup_sched_thermal_decay_shift(char *str) { @@ -399,7 +399,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) static inline void assert_list_leaf_cfs_rq(struct rq *rq) { - SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); + WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } /* Iterate through all leaf cfs_rq's on a runqueue */ @@ -696,7 +696,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { s64 vlag, limit; - SCHED_WARN_ON(!se->on_rq); + WARN_ON_ONCE(!se->on_rq); vlag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); @@ -884,6 +884,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ +static inline void set_protect_slice(struct sched_entity *se) +{ + se->vlag = se->deadline; +} + +static inline bool protect_slice(struct sched_entity *se) +{ + return se->vlag == se->deadline; +} + +static inline void cancel_protect_slice(struct sched_entity *se) +{ + if (protect_slice(se)) + se->vlag = se->deadline + 1; +} + +/* * Earliest Eligible Virtual Deadline First * * In order to provide latency guarantees for different request sizes @@ -919,11 +939,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - /* - * Once selected, run a task until it either becomes non-eligible or - * until it gets a new slice. See the HACK in set_next_entity(). 
- */ - if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -967,7 +983,6 @@ found: return best; } -#ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -994,7 +1009,6 @@ int sched_update_scaling(void) return 0; } #endif -#endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -3301,7 +3315,7 @@ static void task_numa_work(struct callback_head *work) bool vma_pids_skipped; bool vma_pids_forced = false; - SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); work->next = work; /* @@ -4020,7 +4034,7 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa) * Make sure that rounding and/or propagation of PELT values never * break this. */ - SCHED_WARN_ON(sa->load_avg || + WARN_ON_ONCE(sa->load_avg || sa->util_avg || sa->runnable_avg); @@ -4045,15 +4059,17 @@ static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) { struct cfs_rq *prev_cfs_rq; struct list_head *prev; + struct rq *rq = rq_of(cfs_rq); if (cfs_rq->on_list) { prev = cfs_rq->leaf_cfs_rq_list.prev; } else { - struct rq *rq = rq_of(cfs_rq); - prev = rq->tmp_alone_branch; } + if (prev == &rq->leaf_cfs_rq_list) + return false; + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); return (prev_cfs_rq->tg->parent == cfs_rq->tg); @@ -5385,6 +5401,15 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void set_delayed(struct sched_entity *se) { se->sched_delayed = 1; + + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since dequeue_entities() + * will account it for blocked tasks. + */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -5397,6 +5422,16 @@ static void set_delayed(struct sched_entity *se) static void clear_delayed(struct sched_entity *se) { se->sched_delayed = 0; + + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since a dequeue has + * already accounted for it or an enqueue of a task + * below it will account for it in enqueue_task_fair(). + */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -5423,7 +5458,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); if (flags & DEQUEUE_DELAYED) { - SCHED_WARN_ON(!se->sched_delayed); + WARN_ON_ONCE(!se->sched_delayed); } else { bool delay = sleep; /* @@ -5433,7 +5468,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_SPECIAL) delay = false; - SCHED_WARN_ON(delay && se->sched_delayed); + WARN_ON_ONCE(delay && se->sched_delayed); if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { @@ -5509,15 +5544,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. 
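The removed comment above describes the trick now wrapped by the set_protect_slice()/protect_slice()/cancel_protect_slice() helpers added earlier in this file: the picked entity's deadline is stashed in ->vlag (which is not consumed again until dequeue recomputes it), so vlag == deadline encodes "slice protected" and any other value drops the protection. A stand-alone sketch of the encoding, using a toy struct rather than struct sched_entity:

#include <stdbool.h>

struct toy_entity {
	long long vlag;
	long long deadline;
};

/* Called when the entity is picked: mark it slice-protected. */
static void toy_set_protect_slice(struct toy_entity *se)
{
	se->vlag = se->deadline;
}

/* RUN_TO_PARITY keeps running the current entity while this holds. */
static bool toy_protect_slice(struct toy_entity *se)
{
	return se->vlag == se->deadline;
}

/* Nudge the stash off the deadline so the protection test fails. */
static void toy_cancel_protect_slice(struct toy_entity *se)
{
	if (toy_protect_slice(se))
		se->vlag = se->deadline + 1;
}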
- */ - se->vlag = se->deadline; + + set_protect_slice(se); } update_stats_curr_start(cfs_rq, se); - SCHED_WARN_ON(cfs_rq->curr); + WARN_ON_ONCE(cfs_rq->curr); cfs_rq->curr = se; /* @@ -5558,7 +5590,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) if (sched_feat(PICK_BUDDY) && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ - SCHED_WARN_ON(cfs_rq->next->sched_delayed); + WARN_ON_ONCE(cfs_rq->next->sched_delayed); return cfs_rq->next; } @@ -5594,7 +5626,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } - SCHED_WARN_ON(cfs_rq->curr != prev); + WARN_ON_ONCE(cfs_rq->curr != prev); cfs_rq->curr = NULL; } @@ -5817,7 +5849,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_self = 0; - if (SCHED_WARN_ON((s64)delta < 0)) + if (WARN_ON_ONCE((s64)delta < 0)) delta = 0; cfs_rq->throttled_clock_self_time += delta; @@ -5837,7 +5869,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); - SCHED_WARN_ON(cfs_rq->throttled_clock_self); + WARN_ON_ONCE(cfs_rq->throttled_clock_self); if (cfs_rq->nr_queued) cfs_rq->throttled_clock_self = rq_clock(rq); } @@ -5946,7 +5978,7 @@ done: * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; - SCHED_WARN_ON(cfs_rq->throttled_clock); + WARN_ON_ONCE(cfs_rq->throttled_clock); if (cfs_rq->nr_queued) cfs_rq->throttled_clock = rq_clock(rq); return true; @@ -6102,7 +6134,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) } /* Already enqueued */ - if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list))) return; first = list_empty(&rq->cfsb_csd_list); @@ -6121,7 +6153,7 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { lockdep_assert_rq_held(rq_of(cfs_rq)); - if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) || cfs_rq->runtime_remaining <= 0)) return; @@ -6157,7 +6189,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) goto next; /* By the above checks, this should never be true */ - SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; @@ -6178,7 +6210,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) * We currently only expect to be unthrottling * a single cfs_rq locally. */ - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); list_add_tail(&cfs_rq->throttled_csd_list, &local_unthrottle); } @@ -6203,7 +6235,7 @@ next: rq_unlock_irqrestore(rq, &rf); } - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); rcu_read_unlock(); @@ -6520,14 +6552,14 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren cfs_b->hierarchical_quota = parent ? 
parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); /* Add a random offset so that timers interleave */ hrtimer_set_expires(&cfs_b->period_timer, get_random_u32_below(cfs_b->period)); - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; + hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cfs_b->slack_started = false; } @@ -6755,7 +6787,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - SCHED_WARN_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; @@ -6866,8 +6898,8 @@ requeue_delayed_entity(struct sched_entity *se) * Because a delayed entity is one that is still on * the runqueue competing until elegibility. */ - SCHED_WARN_ON(!se->sched_delayed); - SCHED_WARN_ON(!se->on_rq); + WARN_ON_ONCE(!se->sched_delayed); + WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { update_entity_lag(cfs_rq, se); @@ -6970,6 +7002,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable += h_nr_runnable; @@ -7099,6 +7133,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_runnable -= h_nr_runnable; @@ -7123,8 +7159,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) rq->next_balance = jiffies; if (p && task_delayed) { - SCHED_WARN_ON(!task_sleep); - SCHED_WARN_ON(p->on_rq != 1); + WARN_ON_ONCE(!task_sleep); + WARN_ON_ONCE(p->on_rq != 1); /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); @@ -8702,7 +8738,7 @@ static inline void set_task_max_allowed_capacity(struct task_struct *p) {} static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) + if (WARN_ON_ONCE(!se->on_rq)) return; if (se_is_idle(se)) return; @@ -8762,8 +8798,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ - if (cse_is_idle && !pse_is_idle) + if (cse_is_idle && !pse_is_idle) { + /* + * When non-idle entity preempt an idle entity, + * don't give idle entity slice protection. + */ + cancel_protect_slice(se); goto preempt; + } + if (cse_is_idle != pse_is_idle) return; @@ -8782,8 +8825,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Note that even if @p does not turn out to be the most eligible * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) - se->vlag = se->deadline + 1; + if (do_preempt_short(cfs_rq, pse, se)) + cancel_protect_slice(se); /* * If @p has become the most eligible task, force preemption. 
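The init_cfs_bandwidth() hunk above (and the init_rt_bandwidth() one further below) also converts the two-step hrtimer_init() plus ->function assignment into a single hrtimer_setup() call. A minimal sketch of the conversion with placeholder names (my_obj and my_timer_fn are not symbols from this patch):

#include <linux/hrtimer.h>

struct my_obj {
	struct hrtimer timer;
};

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
	return HRTIMER_NORESTART;
}

/* Old style: initialize, then wire up the callback by hand. */
static void my_obj_timer_init_old(struct my_obj *o)
{
	hrtimer_init(&o->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	o->timer.function = my_timer_fn;
}

/* New style: clock, mode and callback in one call. */
static void my_obj_timer_init_new(struct my_obj *o)
{
	hrtimer_setup(&o->timer, my_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
}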
@@ -9396,12 +9439,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { - env->flags |= LBF_DST_PINNED; - env->new_dst_cpu = cpu; - break; - } + cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr); + + if (cpu < nr_cpu_ids) { + env->flags |= LBF_DST_PINNED; + env->new_dst_cpu = cpu; } return 0; @@ -12440,7 +12482,7 @@ unlock: void nohz_balance_exit_idle(struct rq *rq) { - SCHED_WARN_ON(rq != this_rq()); + WARN_ON_ONCE(rq != this_rq()); if (likely(!rq->nohz_tick_stopped)) return; @@ -12476,7 +12518,7 @@ void nohz_balance_enter_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - SCHED_WARN_ON(cpu != smp_processor_id()); + WARN_ON_ONCE(cpu != smp_processor_id()); /* If this CPU is going down, then nothing needs to be done: */ if (!cpu_active(cpu)) @@ -12559,7 +12601,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) int balance_cpu; struct rq *rq; - SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); + WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); /* * We assume there will be no idle load after this update and clear @@ -12999,7 +13041,7 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, struct cfs_rq *cfs_rqb; s64 delta; - SCHED_WARN_ON(task_rq(b)->core != rq->core); + WARN_ON_ONCE(task_rq(b)->core != rq->core); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -13202,7 +13244,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) static void switched_to_fair(struct rq *rq, struct task_struct *p) { - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); @@ -13237,7 +13279,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs if (!first) return; - SCHED_WARN_ON(se->sched_delayed); + WARN_ON_ONCE(se->sched_delayed); if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); @@ -13624,7 +13666,6 @@ DEFINE_SCHED_CLASS(fair) = { #endif }; -#ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq, *pos; @@ -13658,7 +13699,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ __init void init_sched_fair_class(void) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4b8e33c615b1..fa03ec3ed56a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -127,9 +127,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_HARD); - rt_b->rt_period_timer.function = sched_rt_period_timer; + hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); } static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) @@ -169,9 +168,8 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_SCHED_DEBUG WARN_ON_ONCE(!rt_entity_is_task(rt_se)); -#endif + return container_of(rt_se, struct task_struct, rt); } @@ -1713,7 +1711,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; - if (SCHED_WARN_ON(list_empty(queue))) + if (WARN_ON_ONCE(list_empty(queue))) return 
NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); @@ -2910,6 +2908,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff int ret; mutex_lock(&mutex); + sched_domains_mutex_lock(); old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; @@ -2936,6 +2935,7 @@ undo: sysctl_sched_rt_period = old_period; sysctl_sched_rt_runtime = old_runtime; } + sched_domains_mutex_unlock(); mutex_unlock(&mutex); return ret; @@ -2967,7 +2967,6 @@ static int sched_rr_handler(const struct ctl_table *table, int write, void *buff } #endif /* CONFIG_SYSCTL */ -#ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; @@ -2978,4 +2977,3 @@ void print_rt_stats(struct seq_file *m, int cpu) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 38e0e323dda2..47972f34ea70 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -91,12 +91,6 @@ struct cpuidle_state; #include "cpupri.h" #include "cpudeadline.h" -#ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -#else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) -#endif - /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -572,7 +566,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); @@ -998,7 +992,7 @@ struct root_domain { * Also, some corner cases, like 'wrap around' is dangerous, but given * that u64 is 'big enough'. So that shouldn't be a concern. */ - u64 visit_gen; + u64 visit_cookie; #ifdef HAVE_RT_PUSH_IPI /* @@ -1180,10 +1174,8 @@ struct rq { atomic_t nr_iowait; -#ifdef CONFIG_SCHED_DEBUG u64 last_seen_need_resched_ns; int ticks_without_resched; -#endif #ifdef CONFIG_MEMBARRIER int membarrier_state; @@ -1571,7 +1563,7 @@ static inline void update_idle_core(struct rq *rq) { } static inline struct task_struct *task_of(struct sched_entity *se) { - SCHED_WARN_ON(!entity_is_task(se)); + WARN_ON_ONCE(!entity_is_task(se)); return container_of(se, struct task_struct, se); } @@ -1652,7 +1644,7 @@ static inline void assert_clock_updated(struct rq *rq) * The only reason for not seeing a clock update since the * last rq_pin_lock() is if we're currently skipping updates. */ - SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP); } static inline u64 rq_clock(struct rq *rq) @@ -1699,7 +1691,7 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq) static inline void rq_clock_start_loop_update(struct rq *rq) { lockdep_assert_rq_held(rq); - SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP); rq->clock_update_flags |= RQCF_ACT_SKIP; } @@ -1712,14 +1704,12 @@ static inline void rq_clock_stop_loop_update(struct rq *rq) struct rq_flags { unsigned long flags; struct pin_cookie cookie; -#ifdef CONFIG_SCHED_DEBUG /* * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the * current pin context is stashed here in case it needs to be * restored in rq_repin_lock(). 
*/ unsigned int clock_update_flags; -#endif }; extern struct balance_callback balance_push_callback; @@ -1770,21 +1760,18 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); -#ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -# ifdef CONFIG_SMP - SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); -# endif +#ifdef CONFIG_SMP + WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); #endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SCHED_DEBUG if (rq->clock_update_flags > RQCF_ACT_SKIP) rf->clock_update_flags = RQCF_UPDATED; -#endif + scx_rq_clock_invalidate(rq); lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } @@ -1793,12 +1780,10 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { lockdep_repin_lock(__rq_lockp(rq), rf->cookie); -#ifdef CONFIG_SCHED_DEBUG /* * Restore the value we stashed in @rf for this pin context. */ rq->clock_update_flags |= rf->clock_update_flags; -#endif } extern @@ -2072,9 +2057,7 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ -#ifdef CONFIG_SCHED_DEBUG int id; -#endif unsigned long cpumask[]; /* Balance mask */ }; @@ -2114,13 +2097,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) extern int group_balance_cpu(struct sched_group *sg); -#ifdef CONFIG_SCHED_DEBUG extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); -#else -static inline void update_sched_domain_debugfs(void) { } -static inline void dirty_sched_domain_sysctl(int cpu) { } -#endif extern int sched_update_scaling(void); @@ -2200,13 +2178,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) } /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + * Tunables: */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug const -#endif #define SCHED_FEAT(name, enabled) \ __SCHED_FEAT_##name , @@ -2218,13 +2191,11 @@ enum { #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG - /* * To support run-time toggling of sched features, all the translation units * (but core.c) reference the sysctl_sched_features defined in core.c. */ -extern const_debug unsigned int sysctl_sched_features; +extern __read_mostly unsigned int sysctl_sched_features; #ifdef CONFIG_JUMP_LABEL @@ -2246,24 +2217,6 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* !CONFIG_JUMP_LABEL */ -#else /* !SCHED_DEBUG: */ - -/* - * Each translation unit has its own copy of sysctl_sched_features to allow - * constants propagation at compile time and compiler optimization based on - * features default. 
- */ -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | -static const_debug __maybe_unused unsigned int sysctl_sched_features = -#include "features.h" - 0; -#undef SCHED_FEAT - -#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) - -#endif /* !SCHED_DEBUG */ - extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -2685,7 +2638,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); return rq->idle_state; } @@ -2843,12 +2796,11 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); # define SCHED_NR_MIGRATE_BREAK 32 #endif -extern const_debug unsigned int sysctl_sched_nr_migrate; -extern const_debug unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; +extern __read_mostly unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; -#ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; @@ -2859,7 +2811,6 @@ extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; -#endif #ifdef CONFIG_SCHED_HRTICK @@ -2932,7 +2883,6 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif -#ifdef CONFIG_SCHED_DEBUG /* * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to * acquire rq lock instead of rq_lock(). So at the end of these two functions @@ -2947,9 +2897,6 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); #endif } -#else -static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { } -#endif #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) 
\ __DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ @@ -3162,7 +3109,6 @@ extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); -#ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_verbose; extern void print_cfs_stats(struct seq_file *m, int cpu); @@ -3173,15 +3119,12 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); -# ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf); -# endif /* CONFIG_NUMA_BALANCING */ -#else /* !CONFIG_SCHED_DEBUG: */ -static inline void resched_latency_warn(int cpu, u64 latency) { } -#endif /* !CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_NUMA_BALANCING */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); @@ -3259,11 +3202,11 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime); +extern int sched_clock_irqtime; static inline int irqtime_enabled(void) { - return static_branch_likely(&sched_clock_irqtime); + return sched_clock_irqtime; } /* @@ -3394,6 +3337,31 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; } unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} + +/* + * Enabling static branches would get the cpus_read_lock(), + * check whether uclamp_is_used before enable it to avoid always + * calling cpus_read_lock(). Because we never disable this + * static key once enable it. + */ +static inline void sched_uclamp_enable(void) +{ + if (!uclamp_is_used()) + static_branch_enable(&sched_uclamp_used); +} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3417,7 +3385,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) unsigned long rq_util; unsigned long max_util; - if (!static_branch_likely(&sched_uclamp_used)) + if (!uclamp_is_used()) return false; rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); @@ -3426,19 +3394,6 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; } -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace performs - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq level - * hence is active. 
- */ -static inline bool uclamp_is_used(void) -{ - return static_branch_likely(&sched_uclamp_used); -} - #define for_each_clamp_id(clamp_id) \ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) @@ -3486,6 +3441,8 @@ static inline bool uclamp_is_used(void) return false; } +static inline void sched_uclamp_enable(void) {} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3619,6 +3576,7 @@ extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID @@ -3698,10 +3656,28 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { struct cpumask *cidmask = mm_cidmask(mm); struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid = __this_cpu_read(pcpu_cid->recent_cid); + int cid, max_nr_cid, allowed_max_nr_cid; + /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } /* Try to re-use recent cid. This improves cache locality. */ - if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) return cid; /* * Expand cid allocation if the maximum number of concurrency @@ -3709,8 +3685,9 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) * and number of threads. Expanding cid allocation as much as * possible improves cache locality. */ - cid = atomic_read(&mm->max_nr_cid); + cid = max_nr_cid; while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) continue; if (!cpumask_test_and_set_cpu(cid, cidmask)) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 19cdbe96f93d..452826df6ae1 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -144,7 +144,7 @@ static inline void psi_enqueue(struct task_struct *p, int flags) if (p->se.sched_delayed) { /* CPU migration of "sleeping" task */ - SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); + WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; if (p->in_iowait) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 456d339be98f..c326de1344fb 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -368,7 +368,7 @@ static int uclamp_validate(struct task_struct *p, * blocking operation which obviously cannot be done while holding * scheduler locks. 
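sched_uclamp_enable(), added in the sched.h hunks above and used in the uclamp_validate() hunk that follows, is an instance of the enable-once static-branch idiom its comment spells out: test the key before enabling so the cpus_read_lock() taken by static_branch_enable() is only paid on the first transition, relying on the key never being disabled again. A generic sketch of the idiom with placeholder names (feature_used, feature_enable):

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(feature_used);

static inline bool feature_is_used(void)
{
	return static_branch_likely(&feature_used);
}

static inline void feature_enable(void)
{
	/* The key is never turned off again, so a racy double enable is harmless. */
	if (!feature_is_used())
		static_branch_enable(&feature_used);
}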
*/ - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); return 0; } @@ -875,7 +875,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; @@ -984,7 +984,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, struct sched_attr attr; int retval; - if (!uattr || pid < 0 || flags) + if (unlikely(!uattr || pid < 0 || flags)) return -EINVAL; retval = sched_copy_attr(uattr, &attr); @@ -1049,7 +1049,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) struct task_struct *p; int retval; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; scoped_guard (rcu) { @@ -1085,8 +1085,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, struct task_struct *p; int retval; - if (!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags) + if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags)) return -EINVAL; scoped_guard (rcu) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c49aea8c1025..f1ebc60d967f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -6,13 +6,19 @@ #include <linux/bsearch.h> DEFINE_MUTEX(sched_domains_mutex); +void sched_domains_mutex_lock(void) +{ + mutex_lock(&sched_domains_mutex); +} +void sched_domains_mutex_unlock(void) +{ + mutex_unlock(&sched_domains_mutex); +} /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; -#ifdef CONFIG_SCHED_DEBUG - static int __init sched_debug_setup(char *str) { sched_debug_verbose = true; @@ -151,15 +157,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } } -#else /* !CONFIG_SCHED_DEBUG */ - -# define sched_debug_verbose 0 -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | @@ -560,7 +557,7 @@ static int init_rootdomain(struct root_domain *rd) rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif - rd->visit_gen = 0; + rd->visit_cookie = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -2275,9 +2272,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; -#ifdef CONFIG_SCHED_DEBUG sgc->id = j; -#endif *per_cpu_ptr(sdd->sgc, j) = sgc; } @@ -2680,7 +2675,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], +static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; @@ -2712,21 +2707,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) { - struct root_domain *rd; - - /* - * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be 
cleared. - * Tasks contribution will be then recomputed - * in function dl_update_tasks_root_domain(), - * dl_servers contribution in function - * dl_restore_server_root_domain(). - */ - rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; - dl_clear_root_domain(rd); + dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; - } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); @@ -2783,6 +2765,7 @@ match3: ndoms_cur = ndoms_new; update_sched_domain_debugfs(); + dl_rebuild_rd_accounting(); } /* @@ -2791,7 +2774,7 @@ match3: void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f59381c4a2ff..41aa761c7738 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -29,13 +29,11 @@ #include <linux/syscalls.h> #include <linux/sysctl.h> +#include <asm/syscall.h> + /* Not exposed in headers: strictly internal use only. */ #define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) -#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER -#include <asm/syscall.h> -#endif - #ifdef CONFIG_SECCOMP_FILTER #include <linux/file.h> #include <linux/filter.h> @@ -576,6 +574,9 @@ void seccomp_filter_release(struct task_struct *tsk) if (WARN_ON((tsk->flags & PF_EXITING) == 0)) return; + if (READ_ONCE(tsk->seccomp.filter) == NULL) + return; + spin_lock_irq(&tsk->sighand->siglock); orig = tsk->seccomp.filter; /* Detach task from its filter tree. */ @@ -601,6 +602,13 @@ static inline void seccomp_sync_threads(unsigned long flags) BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); assert_spin_locked(¤t->sighand->siglock); + /* + * Don't touch any of the threads if the process is being killed. + * This allows for a lockless check in seccomp_filter_release. + */ + if (current->signal->flags & SIGNAL_GROUP_EXIT) + return; + /* Synchronize all threads. */ caller = current; for_each_thread(caller, thread) { @@ -749,6 +757,15 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, if (WARN_ON_ONCE(!fprog)) return false; + /* Our single exception to filtering. 
*/ +#ifdef __NR_uretprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + if (sd->nr == __NR_uretprobe) + return true; +#endif + for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; u16 code = insn->code; @@ -1023,6 +1040,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, +#ifdef __NR_uretprobe + __NR_uretprobe, +#endif -1, /* negative terminated */ }; @@ -1062,6 +1082,13 @@ void secure_computing_strict(int this_syscall) else BUG(); } +int __secure_computing(void) +{ + int this_syscall = syscall_get_nr(current, current_pt_regs()); + + secure_computing_strict(this_syscall); + return 0; +} #else #ifdef CONFIG_SECCOMP_FILTER @@ -1213,13 +1240,12 @@ out: return -1; } -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { u32 filter_ret, action; + struct seccomp_data sd; struct seccomp_filter *match = NULL; int data; - struct seccomp_data sd_local; /* * Make sure that any changes to mode from another thread have @@ -1227,12 +1253,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ smp_rmb(); - if (!sd) { - populate_seccomp_data(&sd_local); - sd = &sd_local; - } + populate_seccomp_data(&sd); - filter_ret = seccomp_run_filters(sd, &match); + filter_ret = seccomp_run_filters(&sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; @@ -1290,13 +1313,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, * a reload of all registers. This does not goto skip since * a skip would have already been reported. */ - if (__seccomp_filter(this_syscall, NULL, true)) + if (__seccomp_filter(this_syscall, true)) return -1; return 0; case SECCOMP_RET_USER_NOTIF: - if (seccomp_do_user_notification(this_syscall, match, sd)) + if (seccomp_do_user_notification(this_syscall, match, &sd)) goto skip; return 0; @@ -1338,8 +1361,7 @@ skip: return -1; } #else -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { BUG(); @@ -1347,7 +1369,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, } #endif -int __secure_computing(const struct seccomp_data *sd) +int __secure_computing(void) { int mode = current->seccomp.mode; int this_syscall; @@ -1356,15 +1378,14 @@ int __secure_computing(const struct seccomp_data *sd) unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return 0; - this_syscall = sd ? sd->nr : - syscall_get_nr(current, current_pt_regs()); + this_syscall = syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ return 0; case SECCOMP_MODE_FILTER: - return __seccomp_filter(this_syscall, sd, false); + return __seccomp_filter(this_syscall, false); /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. 
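mode1_syscalls above is a negative-terminated allowlist, and these hunks add __NR_uretprobe to it (and short-circuit it in seccomp_is_const_allow()). For illustration only, a membership check over such a list can be written as below; syscall_in_list is a placeholder, not the kernel's strict-mode helper:

#include <stdbool.h>

/* Scan a list terminated by a negative entry, as mode1_syscalls is. */
static bool syscall_in_list(const int *list, int nr)
{
	for (; *list >= 0; list++) {
		if (*list == nr)
			return true;
	}
	return false;
}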
*/ case SECCOMP_MODE_DEAD: WARN_ON_ONCE(1); diff --git a/kernel/signal.c b/kernel/signal.c index 875e97f6205a..86ba66d95da5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2092,7 +2092,7 @@ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueu * from a non-periodic timer, then just drop the reference * count. Otherwise queue it on the ignored list. */ - if (tmr->it_signal && tmr->it_sig_periodic) + if (posixtimer_valid(tmr) && tmr->it_sig_periodic) hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); else posixtimer_putref(tmr); @@ -2180,8 +2180,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* - * tsk is a group leader and has no threads, wake up the - * non-PIDFD_THREAD waiters. + * Notify for thread-group leaders without subthreads. */ if (thread_group_empty(tsk)) do_notify_pidfd(tsk); @@ -4009,6 +4008,47 @@ static struct pid *pidfd_to_pid(const struct file *file) (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ PIDFD_SIGNAL_PROCESS_GROUP) +static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type, + siginfo_t __user *info, unsigned int flags) +{ + kernel_siginfo_t kinfo; + + switch (flags) { + case PIDFD_SIGNAL_THREAD: + type = PIDTYPE_PID; + break; + case PIDFD_SIGNAL_THREAD_GROUP: + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } + + if (info) { + int ret; + + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + return ret; + + if (unlikely(sig != kinfo.si_signo)) + return -EINVAL; + + /* Only allow sending arbitrary signals to yourself. */ + if ((task_pid(current) != pid || type > PIDTYPE_TGID) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) + return -EPERM; + } else { + prepare_kill_siginfo(sig, &kinfo, type); + } + + if (type == PIDTYPE_PGID) + return kill_pgrp_info(sig, &kinfo, pid); + + return kill_pid_info_type(sig, &kinfo, pid, type); +} + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -4026,9 +4066,7 @@ static struct pid *pidfd_to_pid(const struct file *file) SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { - int ret; struct pid *pid; - kernel_siginfo_t kinfo; enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ @@ -4039,57 +4077,39 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; - CLASS(fd, f)(pidfd); - if (fd_empty(f)) - return -EBADF; + switch (pidfd) { + case PIDFD_SELF_THREAD: + pid = get_task_pid(current, PIDTYPE_PID); + type = PIDTYPE_PID; + break; + case PIDFD_SELF_THREAD_GROUP: + pid = get_task_pid(current, PIDTYPE_TGID); + type = PIDTYPE_TGID; + break; + default: { + CLASS(fd, f)(pidfd); + if (fd_empty(f)) + return -EBADF; - /* Is this a pidfd? */ - pid = pidfd_to_pid(fd_file(f)); - if (IS_ERR(pid)) - return PTR_ERR(pid); + /* Is this a pidfd? */ + pid = pidfd_to_pid(fd_file(f)); + if (IS_ERR(pid)) + return PTR_ERR(pid); - if (!access_pidfd_pidns(pid)) - return -EINVAL; + if (!access_pidfd_pidns(pid)) + return -EINVAL; - switch (flags) { - case 0: /* Infer scope from the type of pidfd. 
*/ if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_THREAD: - type = PIDTYPE_PID; - break; - case PIDFD_SIGNAL_THREAD_GROUP: - type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_PROCESS_GROUP: - type = PIDTYPE_PGID; - break; - } - if (info) { - ret = copy_siginfo_from_user_any(&kinfo, info); - if (unlikely(ret)) - return ret; - - if (unlikely(sig != kinfo.si_signo)) - return -EINVAL; - - /* Only allow sending arbitrary signals to yourself. */ - if ((task_pid(current) != pid || type > PIDTYPE_TGID) && - (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) - return -EPERM; - } else { - prepare_kill_siginfo(sig, &kinfo, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); + } } - if (type == PIDTYPE_PGID) - return kill_pgrp_info(sig, &kinfo, pid); - else - return kill_pid_info_type(sig, &kinfo, pid, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); } static int diff --git a/kernel/softirq.c b/kernel/softirq.c index 4dae6ac2e83f..513b1945987c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -126,6 +126,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key bh_lock_key; +struct lockdep_map bh_lock_map = { + .name = "local_bh", + .key = &bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */ + .lock_type = LD_LOCK_PERCPU, +}; +EXPORT_SYMBOL_GPL(bh_lock_map); +#endif + /** * local_bh_blocked() - Check for idle whether BH processing is blocked * @@ -148,6 +160,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); + lock_map_acquire_read(&bh_lock_map); + /* First entry of a task into a BH disabled section? 
*/ if (!current->softirq_disable_cnt) { if (preemptible()) { @@ -211,6 +225,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); + lock_map_release(&bh_lock_map); + local_irq_save(flags); curcnt = __this_cpu_read(softirq_ctrl.cnt); @@ -261,6 +277,8 @@ static inline void ksoftirqd_run_begin(void) /* Counterpart to ksoftirqd_run_begin() */ static inline void ksoftirqd_run_end(void) { + /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */ + lock_map_release(&bh_lock_map); __local_bh_enable(SOFTIRQ_OFFSET, true); WARN_ON_ONCE(in_interrupt()); local_irq_enable(); diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index bb7d066a7c39..269683d41aa9 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -206,7 +206,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func) continue; } - arch_static_call_transform(site_addr, NULL, func, + arch_static_call_transform(site_addr, tramp, func, static_call_is_tail(site)); } } @@ -325,13 +325,12 @@ static int __static_call_mod_text_reserved(void *start, void *end) struct module *mod; int ret; - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - + scoped_guard(rcu) { + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + } if (!mod) return 0; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 8896d844d738..5d2d0562115b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -250,6 +250,7 @@ static int multi_cpu_stop(void *data) * be detected and reported on their side. */ touch_nmi_watchdog(); + /* Also suppress RCU CPU stall warnings. */ rcu_momentary_eqs(); } } while (curstate != MULTI_STOP_EXIT); diff --git a/kernel/sys.c b/kernel/sys.c index cb366ff8703a..c434968e9f5d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; + struct pid *pids[PIDTYPE_MAX] = { 0 }; struct pid *pgrp; int err; @@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) goto out; if (task_pgrp(p) != pgrp) - change_pid(p, PIDTYPE_PGID, pgrp); + change_pid(pids, p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. 
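The __static_call_mod_text_reserved() hunk above replaces the open-coded preempt_disable()/preempt_enable() pair with a scoped_guard(rcu) block, which takes rcu_read_lock() on entry and drops it on every exit path of the block. A minimal sketch of the guard idiom with placeholder names (my_obj, my_obj_find, my_obj_tryget):

#include <linux/cleanup.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct my_obj;
struct my_obj *my_obj_find(unsigned long key);
bool my_obj_tryget(struct my_obj *obj);

static struct my_obj *my_obj_lookup(unsigned long key)
{
	struct my_obj *obj;

	scoped_guard(rcu) {
		/* RCU read-side section: look up and pin before leaving it. */
		obj = my_obj_find(key);
		if (obj && !my_obj_tryget(obj))
			obj = NULL;
	}

	return obj;
}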
-DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); + free_pids(pids); return err; } @@ -1222,21 +1224,22 @@ out: return retval; } -static void set_special_pids(struct pid *pid) +static void set_special_pids(struct pid **pids, struct pid *pid) { struct task_struct *curr = current->group_leader; if (task_session(curr) != pid) - change_pid(curr, PIDTYPE_SID, pid); + change_pid(pids, curr, PIDTYPE_SID, pid); if (task_pgrp(curr) != pid) - change_pid(curr, PIDTYPE_PGID, pid); + change_pid(pids, curr, PIDTYPE_PGID, pid); } int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); + struct pid *pids[PIDTYPE_MAX] = { 0 }; pid_t session = pid_vnr(sid); int err = -EPERM; @@ -1252,13 +1255,14 @@ int ksys_setsid(void) goto out; group_leader->signal->leader = 1; - set_special_pids(sid); + set_special_pids(pids, sid); proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); + free_pids(pids); if (err > 0) { proc_sid_connector(group_leader); sched_autogroup_create_attach(group_leader); @@ -2811,6 +2815,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = arch_lock_shadow_stack_status(me, arg2); break; + case PR_TIMER_CREATE_RESTORE_IDS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = posixtimer_create_prctl(arg2); + break; default: trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb57da499ebb..3b7a7308e35b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,9 +20,6 @@ */ #include <linux/module.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/slab.h> #include <linux/sysctl.h> #include <linux/bitmap.h> #include <linux/signal.h> @@ -31,7 +28,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/kmemleak.h> #include <linux/filter.h> #include <linux/fs.h> #include <linux/init.h> @@ -42,26 +38,20 @@ #include <linux/highuid.h> #include <linux/writeback.h> #include <linux/ratelimit.h> -#include <linux/hugetlb.h> #include <linux/initrd.h> #include <linux/key.h> #include <linux/times.h> #include <linux/limits.h> -#include <linux/dcache.h> #include <linux/syscalls.h> -#include <linux/vmstat.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/perf_event.h> -#include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> #include <linux/mount.h> -#include <linux/userfaultfd_k.h> #include <linux/pid.h> #include "../lib/kstrtox.h" @@ -91,12 +81,6 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ - -#ifdef CONFIG_PERF_EVENTS -static const int six_hundred_forty_kb = 640 * 1024; -#endif - - static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -129,12 +113,6 @@ enum sysctl_writes_mode { static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; #endif /* CONFIG_PROC_SYSCTL */ - -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) -int sysctl_legacy_va_layout; -#endif - #endif /* CONFIG_SYSCTL */ /* @@ -1794,15 +1772,6 @@ static const struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_MAXOLDUID, }, -#ifdef CONFIG_S390 - { - .procname = "userprocess_debug", - .data = 
&show_unhandled_signals, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "panic_on_oops", .data = &panic_on_oops, @@ -1831,16 +1800,6 @@ static const struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - #if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ defined(CONFIG_DEBUG_STACKOVERFLOW) { @@ -1851,43 +1810,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_X86) - { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_version", - .data = &bootloader_version, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #if defined(CONFIG_MMU) { .procname = "randomize_va_space", @@ -1897,24 +1819,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", - .data = &spin_retry, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = "acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", @@ -1933,63 +1837,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_PERF_EVENTS - /* - * User-space scripts rely on the existence of this file - * as a feature check for perf_events being enabled. - * - * So it's an ABI, do not remove! 
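The removed comment above records why this entry is sensitive: userspace scripts probe for the perf_event_paranoid sysctl to detect perf_events support. From the userspace side that probe looks roughly like the sketch below (illustrative only, not part of the patch):

#include <stdbool.h>
#include <unistd.h>

/* Feature probe described by the comment: test for the procfs path. */
static bool perf_events_available(void)
{
	return access("/proc/sys/kernel/perf_event_paranoid", F_OK) == 0;
}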
- */ - { - .procname = "perf_event_paranoid", - .data = &sysctl_perf_event_paranoid, - .maxlen = sizeof(sysctl_perf_event_paranoid), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_mlock_kb", - .data = &sysctl_perf_event_mlock, - .maxlen = sizeof(sysctl_perf_event_mlock), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_max_sample_rate", - .data = &sysctl_perf_event_sample_rate, - .maxlen = sizeof(sysctl_perf_event_sample_rate), - .mode = 0644, - .proc_handler = perf_event_max_sample_rate_handler, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "perf_cpu_time_max_percent", - .data = &sysctl_perf_cpu_time_max_percent, - .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), - .mode = 0644, - .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "perf_event_max_stack", - .data = &sysctl_perf_event_max_stack, - .maxlen = sizeof(sysctl_perf_event_max_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&six_hundred_forty_kb, - }, - { - .procname = "perf_event_max_contexts_per_stack", - .data = &sysctl_perf_event_max_contexts_per_stack, - .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_THOUSAND, - }, -#endif { .procname = "panic_on_warn", .data = &panic_on_warn, @@ -2021,215 +1868,9 @@ static const struct ctl_table kern_table[] = { #endif }; -static const struct ctl_table vm_table[] = { - { - .procname = "overcommit_memory", - .data = &sysctl_overcommit_memory, - .maxlen = sizeof(sysctl_overcommit_memory), - .mode = 0644, - .proc_handler = overcommit_policy_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "overcommit_ratio", - .data = &sysctl_overcommit_ratio, - .maxlen = sizeof(sysctl_overcommit_ratio), - .mode = 0644, - .proc_handler = overcommit_ratio_handler, - }, - { - .procname = "overcommit_kbytes", - .data = &sysctl_overcommit_kbytes, - .maxlen = sizeof(sysctl_overcommit_kbytes), - .mode = 0644, - .proc_handler = overcommit_kbytes_handler, - }, - { - .procname = "page-cluster", - .data = &page_cluster, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&page_cluster_max, - }, - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO_HUNDRED, - }, -#ifdef CONFIG_NUMA - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif - { - .procname = "drop_caches", - .data = &sysctl_drop_caches, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = drop_caches_sysctl_handler, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_FOUR, - }, - { - .procname = "page_lock_unfairness", - .data = &sysctl_page_lock_unfairness, - .maxlen = sizeof(sysctl_page_lock_unfairness), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#ifdef 
CONFIG_MMU - { - .procname = "max_map_count", - .data = &sysctl_max_map_count, - .maxlen = sizeof(sysctl_max_map_count), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#else - { - .procname = "nr_trim_pages", - .data = &sysctl_nr_trim_pages, - .maxlen = sizeof(sysctl_nr_trim_pages), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ - defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) - { - .procname = "legacy_va_layout", - .data = &sysctl_legacy_va_layout, - .maxlen = sizeof(sysctl_legacy_va_layout), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_NUMA - { - .procname = "zone_reclaim_mode", - .data = &node_reclaim_mode, - .maxlen = sizeof(node_reclaim_mode), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, -#endif -#ifdef CONFIG_SMP - { - .procname = "stat_interval", - .data = &sysctl_stat_interval, - .maxlen = sizeof(sysctl_stat_interval), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "stat_refresh", - .data = NULL, - .maxlen = 0, - .mode = 0600, - .proc_handler = vmstat_refresh, - }, -#endif -#ifdef CONFIG_MMU - { - .procname = "mmap_min_addr", - .data = &dac_mmap_min_addr, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = mmap_min_addr_handler, - }, -#endif -#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ - (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) - { - .procname = "vdso_enabled", -#ifdef CONFIG_X86_32 - .data = &vdso32_enabled, - .maxlen = sizeof(vdso32_enabled), -#else - .data = &vdso_enabled, - .maxlen = sizeof(vdso_enabled), -#endif - .mode = 0644, - .proc_handler = proc_dointvec, - .extra1 = SYSCTL_ZERO, - }, -#endif - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, - .maxlen = sizeof(sysctl_user_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "admin_reserve_kbytes", - .data = &sysctl_admin_reserve_kbytes, - .maxlen = sizeof(sysctl_admin_reserve_kbytes), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS - { - .procname = "mmap_rnd_bits", - .data = &mmap_rnd_bits, - .maxlen = sizeof(mmap_rnd_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_bits_min, - .extra2 = (void *)&mmap_rnd_bits_max, - }, -#endif -#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS - { - .procname = "mmap_rnd_compat_bits", - .data = &mmap_rnd_compat_bits, - .maxlen = sizeof(mmap_rnd_compat_bits), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&mmap_rnd_compat_bits_min, - .extra2 = (void *)&mmap_rnd_compat_bits_max, - }, -#endif -}; - int __init sysctl_init_bases(void) { register_sysctl_init("kernel", kern_table); - register_sysctl_init("vm", vm_table); return 0; } diff --git a/kernel/time/Makefile b/kernel/time/Makefile index fe0ae82124fe..e6e9b85d4db5 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 + +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += 
time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7304d7cf47f2..e0eeacbe2521 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -373,16 +373,18 @@ void clocksource_verify_percpu(struct clocksource *cs) cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); cpus_read_lock(); - preempt_disable(); + migrate_disable(); clocksource_verify_choose_cpus(); if (cpumask_empty(&cpus_chosen)) { - preempt_enable(); + migrate_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } testcpu = smp_processor_id(); - pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", + cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + preempt_disable(); for_each_cpu(cpu, &cpus_chosen) { if (cpu == testcpu) continue; @@ -402,6 +404,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); + migrate_enable(); cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", @@ -1507,7 +1510,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strscpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str); mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f6d8df94045c..22376a1a75b9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -58,6 +58,8 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) +static void retrigger_next_event(void *arg); + /* * The timer bases: * @@ -111,18 +113,17 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - } + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} /* * Functions and macros which are different for UP/SMP systems are kept in a @@ -145,11 +146,6 @@ static struct hrtimer_cpu_base migration_cpu_base = { #define migration_base migration_cpu_base.clock_base[0] -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return base == &migration_base; -} - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -183,27 +179,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * We do not migrate the timer when it is expiring before the next - * event on the target cpu. When high resolution is enabled, we cannot - * reprogram the target cpu hardware and we would cause it to fire - * late. 
To keep it simple, we handle the high resolution enabled and - disabled case similar. + * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will be issued to reprogram + * it as a last resort. + */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires < new_base->cpu_base->expires_next; + + return expires >= new_base->cpu_base->expires_next; } -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -254,8 +277,8 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -264,8 +287,7 @@ again: } WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -275,11 +297,6 @@ again: #else /* CONFIG_SMP */ -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return false; -} - static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) @@ -716,8 +733,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -static void retrigger_next_event(void *arg); - /* * Switch to high resolution mode */ @@ -1205,6 +1220,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct 
hrtimer_clock_base *new_base; bool force_local, first; @@ -1216,10 +1232,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; /* + * Don't force local queuing if this enqueue happens on an unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + + /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. @@ -1248,8 +1270,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; + + /* + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. + */ + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; + } /* * Timer was forced to stay on the current CPU to avoid @@ -1370,6 +1411,18 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, } } +#ifdef CONFIG_SMP +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} +#else +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} +#endif + /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was @@ -1524,19 +1577,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; - - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; + switch (clock_id) { + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; } - WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; -} - -static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) -{ - return HRTIMER_NORESTART; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0775b9ec952a..e3642278df43 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -165,26 +165,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * HVCLOCK * VVAR * - * The check for vdso_data->clock_mode is in the unlikely path of + * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. 
* * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. */ -static void timens_setup_vdso_data(struct vdso_data *vdata, - struct time_namespace *ns) +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) { - struct timens_offset *offset = vdata->offset; + struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - vdata->seq = 1; - vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; @@ -219,7 +219,8 @@ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { - struct vdso_data *vdata; + struct vdso_time_data *vdata; + struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) @@ -235,10 +236,11 @@ static void timens_set_vvar_page(struct task_struct *task, goto out; ns->frozen_offsets = true; - vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_data(&vdata[i], ns); + timens_setup_vdso_clock_data(&vc[i], ns); out: mutex_unlock(&offset_lock); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 163e7a2033b6..b837d3d9d325 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -678,8 +678,7 @@ void ntp_notify_cmos_timer(bool offset_set) static void __init ntp_init_cmos_sync(void) { - hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - sync_hrtimer.function = sync_timer_callback; + hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); } #else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ static inline void __init ntp_init_cmos_sync(void) { } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 1af0bb2cc45c..101a0f7c43e0 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp, return err; } -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock_context *pccontext = fp->private_data; - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(pccontext, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - static int posix_clock_open(struct inode *inode, struct file *fp) { int err; @@ -129,6 +109,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) goto out; } pccontext->clk = clk; + pccontext->fp = fp; if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); if (err) { @@ -171,11 +152,9 @@ static const struct file_operations posix_clock_file_operations = { .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, + .compat_ioctl = 
posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) @@ -251,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) if (err) return err; - if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1b675aee99a9..6222112533a7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -9,28 +9,23 @@ * * These are all the functions necessary to implement POSIX clocks & timers */ -#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/jhash.h> #include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/mutex.h> -#include <linux/sched/task.h> - -#include <linux/uaccess.h> #include <linux/list.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/hash.h> +#include <linux/memblock.h> +#include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> +#include <linux/prctl.h> +#include <linux/sched/task.h> +#include <linux/slab.h> #include <linux/syscalls.h> -#include <linux/wait.h> -#include <linux/workqueue.h> -#include <linux/export.h> -#include <linux/hashtable.h> -#include <linux/compat.h> -#include <linux/nospec.h> +#include <linux/time.h> #include <linux/time_namespace.h> +#include <linux/uaccess.h> #include "timekeeping.h" #include "posix-timers.h" @@ -46,39 +41,65 @@ static struct kmem_cache *posix_timers_cache; * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ -static DEFINE_HASHTABLE(posix_timers_hashtable, 9); -static DEFINE_SPINLOCK(hash_lock); +struct timer_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; + +static struct { + struct timer_hash_bucket *buckets; + unsigned long mask; +} __timer_data __ro_after_init __aligned(2*sizeof(long)); + +#define timer_buckets (__timer_data.buckets) +#define timer_hashmask (__timer_data.mask) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; +#define TIMER_ANY_ID INT_MIN + /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 
#endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id); -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ +#define lock_timer(tid) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ + __timr; \ }) -static int hash(struct signal_struct *sig, unsigned int nr) +static inline void unlock_timer(struct k_itimer *timr) { - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); + if (likely((timr))) + spin_unlock_irq(&timr->it_lock); } -static struct k_itimer *__posix_timers_find(struct hlist_head *head, - struct signal_struct *sig, - timer_t id) +#define scoped_timer_get_or_fail(_id) \ + scoped_cond_guard(lock_timer, return -EINVAL, _id) + +#define scoped_timer (scope) + +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); +DEFINE_CLASS_IS_COND_GUARD(lock_timer); + +static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) { + return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct timer_hash_bucket *bucket = hash_bucket(sig, id); struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; @@ -86,46 +107,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, return NULL; } -static struct k_itimer *posix_timer_by_id(timer_t id) +static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) { - struct signal_struct *sig = current->signal; - struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + unsigned long val = (unsigned long)timer->it_signal; - return __posix_timers_find(head, sig, id); + /* + * Mask out bit 0, which acts as invalid marker to prevent + * posix_timer_by_id() detecting it as valid. + */ + return (struct signal_struct *)(val & ~1UL); } -static int posix_timer_add(struct k_itimer *timer) +static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, + timer_t id) { - struct signal_struct *sig = current->signal; - struct hlist_head *head; - unsigned int cnt, id; + struct hlist_head *head = &bucket->head; + struct k_itimer *timer; - /* - * FIXME: Replace this by a per signal struct xarray once there is - * a plan to handle the resulting CRIU regression gracefully. - */ - for (cnt = 0; cnt <= INT_MAX; cnt++) { - spin_lock(&hash_lock); - id = sig->next_posix_timer_id; + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { + if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) + return true; + } + return false; +} - /* Write the next ID back. 
Clamp it to the positive space */ - sig->next_posix_timer_id = (id + 1) & INT_MAX; +static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) +{ + struct timer_hash_bucket *bucket = hash_bucket(sig, id); - head = &posix_timers_hashtable[hash(sig, id)]; - if (!__posix_timers_find(head, sig, id)) { - hlist_add_head_rcu(&timer->t_hash, head); - spin_unlock(&hash_lock); - return id; + scoped_guard (spinlock, &bucket->lock) { + /* + * Validate under the lock as this could have raced against + * another thread ending up with the same ID, which is + * highly unlikely, but possible. + */ + if (!posix_timer_hashed(bucket, sig, id)) { + /* + * Set the timer ID and the signal pointer to make + * it identifiable in the hash table. The signal + * pointer has bit 0 set to indicate that it is not + * yet fully initialized. posix_timer_hashed() + * masks this bit out, but the syscall lookup fails + * to match due to it being set. This guarantees + * that there can't be duplicate timer IDs handed + * out. + */ + timer->it_id = (timer_t)id; + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); + hlist_add_head_rcu(&timer->t_hash, &bucket->head); + return true; } - spin_unlock(&hash_lock); } - /* POSIX return code when no timer ID could be allocated */ - return -EAGAIN; + return false; } -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +static int posix_timer_add(struct k_itimer *timer, int req_id) { - spin_unlock_irqrestore(&timr->it_lock, flags); + struct signal_struct *sig = current->signal; + + if (unlikely(req_id != TIMER_ANY_ID)) { + if (!posix_timer_add_at(timer, sig, req_id)) + return -EBUSY; + + /* + * Move the ID counter past the requested ID, so that after + * switching back to normal mode the IDs are outside of the + * exact allocated region. That avoids ID collisions on the + * next regular timer_create() invocations. + */ + atomic_set(&sig->next_posix_timer_id, req_id + 1); + return req_id; + } + + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { + /* Get the next timer ID and clamp it to positive space */ + unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; + + if (posix_timer_add_at(timer, sig, id)) + return id; + cond_resched(); + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) @@ -222,9 +285,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof(struct k_itimer), 0, - SLAB_PANIC | SLAB_ACCOUNT, NULL); + posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), + __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); @@ -259,7 +321,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it * since the signal was queued. In either case, don't rearm and * drop the signal. 
*/ - if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) return false; if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) @@ -304,6 +366,9 @@ void posix_timer_queue_signal(struct k_itimer *timr) { lockdep_assert_held(&timr->it_lock); + if (!posixtimer_valid(timr)) + return; + timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; posixtimer_send_sigqueue(timr); } @@ -324,6 +389,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +long posixtimer_create_prctl(unsigned long ctrl) +{ + switch (ctrl) { + case PR_TIMER_CREATE_RESTORE_IDS_OFF: + current->signal->timer_create_restore_ids = 0; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_ON: + current->signal->timer_create_restore_ids = 1; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_GET: + return current->signal->timer_create_restore_ids; + } + return -EINVAL; +} + static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); @@ -350,8 +430,12 @@ static struct pid *good_sigevent(sigevent_t * event) static struct k_itimer *alloc_posix_timer(void) { - struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr; + if (unlikely(!posix_timers_cache)) + return NULL; + + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; @@ -373,15 +457,16 @@ void posixtimer_free_timer(struct k_itimer *tmr) static void posix_timer_unhash_and_free(struct k_itimer *tmr) { - spin_lock(&hash_lock); - hlist_del_rcu(&tmr->t_hash); - spin_unlock(&hash_lock); + struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); + + scoped_guard (spinlock, &bucket->lock) + hlist_del_rcu(&tmr->t_hash); posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } @@ -390,6 +475,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); + timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; int error, new_timer_id; @@ -404,26 +490,32 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, spin_lock_init(&new_timer->it_lock); + /* Special case for CRIU to restore timers with a given timer ID. */ + if (unlikely(current->signal->timer_create_restore_ids)) { + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) + return -EFAULT; + /* Valid IDs are 0..INT_MAX */ + if ((unsigned int)req_id > INT_MAX) + return -EINVAL; + } + /* * Add the timer to the hash table. The timer is not yet valid - * because new_timer::it_signal is still NULL. The timer id is also - * not yet visible to user space. + * after insertion, but has a unique ID allocated. 
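The do_timer_create() hunk above wires up the CRIU restore path: when the per-process timer_create_restore_ids flag is set via posixtimer_create_prctl(), the requested timer ID is read from the user-supplied created_timer_id pointer and handed to posix_timer_add() as req_id, which fails with EBUSY if that ID is already taken. A rough userspace sketch of how a restorer might drive this follows; the outer prctl option name PR_TIMER_CREATE_RESTORE_IDS is an assumption (only the _OFF/_ON/_GET sub-command constants appear in this diff), so treat it as illustrative rather than a documented interface.

#include <signal.h>
#include <sys/prctl.h>	/* needs a linux/prctl.h new enough to carry the constants below */
#include <time.h>

static int restore_timer_with_id(timer_t wanted)
{
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGALRM,
	};
	timer_t id = wanted;	/* input: requested ID, output: created timer */

	/*
	 * PR_TIMER_CREATE_RESTORE_IDS is assumed to be the prctl option that
	 * reaches posixtimer_create_prctl(); only the _ON/_OFF/_GET arguments
	 * are visible in this diff.
	 */
	if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0))
		return -1;

	/* The kernel reads the wanted ID from &id and fails with EBUSY if it is in use. */
	if (timer_create(CLOCK_MONOTONIC, &sev, &id))
		return -1;

	/* Back to normal ID allocation for subsequent timer_create() calls. */
	return prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0);
}

Note how the ID counter bump in posix_timer_add() (atomic_set(&sig->next_posix_timer_id, req_id + 1)) keeps later normal creations from colliding with the restored range.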
*/ - new_timer_id = posix_timer_add(new_timer); + new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { posixtimer_free_timer(new_timer); return new_timer_id; } - new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(event)); - rcu_read_unlock(); + scoped_guard (rcu) + new_timer->it_pid = get_pid(good_sigevent(event)); if (!new_timer->it_pid) { error = -EINVAL; goto out; @@ -434,7 +526,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq.info.si_signo = SIGALRM; - memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } @@ -453,7 +544,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } /* * After succesful copy out, the timer ID is visible to user space - * now but not yet valid because new_timer::signal is still NULL. + * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create * callback. @@ -462,14 +553,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (error) goto out; - spin_lock_irq(¤t->sighand->siglock); - /* This makes the timer valid in the hash table */ - WRITE_ONCE(new_timer->it_signal, current->signal); - hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); - spin_unlock_irq(¤t->sighand->siglock); /* - * After unlocking sighand::siglock @new_timer is subject to - * concurrent removal and cannot be touched anymore + * timer::it_lock ensures that __lock_timer() observes a fully + * initialized timer when it observes a valid timer::it_signal. + * + * sighand::siglock is required to protect signal::posix_timers. + */ + scoped_guard (spinlock_irq, &new_timer->it_lock) { + guard(spinlock)(¤t->sighand->siglock); + /* + * new_timer::it_signal contains the signal pointer with + * bit 0 set, which makes it invalid for syscall operations. + * Store the unmodified signal pointer to make it valid. + */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head_rcu(&new_timer->list, ¤t->signal->posix_timers); + } + /* + * After unlocking @new_timer is subject to concurrent removal and + * cannot be touched anymore */ return 0; out: @@ -507,7 +609,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id) { struct k_itimer *timr; @@ -522,11 +624,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where - * timr::it_signal == NULL. timer::it_signal is only set after the - * rest of the initialization succeeded. + * timr::it_signal is marked invalid. timer::it_signal is only set + * after the rest of the initialization succeeded. * * Timer destruction happens in steps: - * 1) Set timr::it_signal to NULL with timr::it_lock held + * 1) Set timr::it_signal marked invalid with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Put the reference count. 
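The creation and teardown rules above hinge on using bit 0 of timer::it_signal as an "invalid" marker: the hash insert publishes the pointer with the bit set, the final creation step stores the plain pointer, and lookups only accept an exact match against current->signal. A minimal, generic C sketch of that pointer-tagging pattern (simplified names, not the kernel code itself):

#include <stdbool.h>
#include <stdint.h>

struct owner;				/* stand-in for struct signal_struct */

struct tagged_timer {
	struct owner *sig;		/* bit 0 set => not (or no longer) valid */
};

/* Hash insertion: publish the owner with bit 0 set, findable but invalid. */
static void timer_publish_invalid(struct tagged_timer *t, struct owner *sig)
{
	t->sig = (struct owner *)((uintptr_t)sig | 1UL);
}

/* Last step of creation: store the plain pointer, which makes the timer valid. */
static void timer_make_valid(struct tagged_timer *t, struct owner *sig)
{
	t->sig = sig;
}

/* Deletion: set the bit again so concurrent lookups reject the entry. */
static void timer_invalidate(struct tagged_timer *t)
{
	t->sig = (struct owner *)((uintptr_t)t->sig | 1UL);
}

/* Lookup check: a tagged value can never equal a properly aligned owner pointer. */
static bool timer_valid_for(struct tagged_timer *t, const struct owner *sig)
{
	return t->sig == sig;
}

/* Owner lookup for hashing still works while tagged: mask the marker out. */
static struct owner *timer_owner(struct tagged_timer *t)
{
	return (struct owner *)((uintptr_t)t->sig & ~(uintptr_t)1);
}

Because struct signal_struct is at least word-aligned, a real pointer never has bit 0 set, so the tagged value can never be mistaken for a valid owner while the bucket still resolves to the right slot.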
@@ -543,25 +645,21 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id - * can't change, but timr::it_signal becomes NULL during - * destruction. + * can't change, but timr::it_signal can become invalid during + * destruction, which makes the locked check fail. */ - rcu_read_lock(); + guard(rcu)(); timr = posix_timer_by_id(timer_id); if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); + spin_lock_irq(&timr->it_lock); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. */ - if (timr->it_signal == current->signal) { - rcu_read_unlock(); + if (timr->it_signal == current->signal) return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); + spin_unlock_irq(&timr->it_lock); } - rcu_read_unlock(); - return NULL; } @@ -652,24 +750,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - memset(setting, 0, sizeof(*setting)); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, setting); - - unlock_timer(timr, flags); - return ret; + scoped_timer_get_or_fail(timer_id) + scoped_timer->kclock->timer_get(scoped_timer, setting); + return 0; } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -723,18 +807,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { - struct k_itimer *timr; - unsigned long flags; - int overrun; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timer_overrun_to_int(timr); - unlock_timer(timr, flags); - - return overrun; + scoped_timer_get_or_fail(timer_id) + return timer_overrun_to_int(scoped_timer); } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, @@ -747,7 +821,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the - * hood. See hrtimer_init(). Update timr->kclock, so the generic + * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might @@ -756,8 +830,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; + hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) expires = ktime_add_safe(expires, timer->base->get_time()); @@ -791,26 +864,13 @@ static void common_timer_wait_running(struct k_itimer *timer) * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. 
*/ -static struct k_itimer *timer_wait_running(struct k_itimer *timer, - unsigned long *flags) +static void timer_wait_running(struct k_itimer *timer) { - const struct k_clock *kc = READ_ONCE(timer->kclock); - timer_t timer_id = READ_ONCE(timer->it_id); - - /* Prevent kfree(timer) after dropping the lock */ - rcu_read_lock(); - unlock_timer(timer, *flags); - /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ - if (!WARN_ON_ONCE(!kc->timer_wait_running)) - kc->timer_wait_running(timer); - - rcu_read_unlock(); - /* Relock the timer. It might be not longer hashed. */ - return lock_timer(timer_id, flags); + timer->kclock->timer_wait_running(timer); } /* @@ -865,15 +925,9 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int tmr_flags, - struct itimerspec64 *new_spec64, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int error; - if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; @@ -881,33 +935,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); - timr = lock_timer(timer_id, &flags); -retry: - if (!timr) - return -EINVAL; + for (; ; old_spec64 = NULL) { + struct k_itimer *timr; - if (old_spec64) - old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + scoped_timer_get_or_fail(timer_id) { + timr = scoped_timer; - /* Prevent signal delivery and rearming. */ - timr->it_signal_seq++; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - - if (error == TIMER_RETRY) { - // We already got the old time... - old_spec64 = NULL; - /* Unlocks and relocks the timer if it still exists */ - timr = timer_wait_running(timr, &flags); - goto retry; - } - unlock_timer(timr, flags); + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; - return error; + int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); + if (ret != TIMER_RETRY) + return ret; + + /* Protect the timer from being freed when leaving the lock scope */ + rcu_read_lock(); + } + timer_wait_running(timr); + rcu_read_unlock(); + } } /* Set a POSIX.1b interval timer */ @@ -978,110 +1027,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) } } -static inline int timer_delete_hook(struct k_itimer *timer) +static void posix_timer_delete(struct k_itimer *timer) { - const struct k_clock *kc = timer->kclock; - - /* Prevent signal delivery and rearming. */ + /* + * Invalidate the timer, remove it from the linked list and remove + * it from the ignored list if pending. + * + * The invalidation must be written with siglock held so that the + * signal code observes the invalidated timer::it_signal in + * do_sigaction(), which prevents it from moving a pending signal + * of a deleted timer to the ignore list. + * + * The invalidation also prevents signal queueing, signal delivery + * and therefore rearming from the signal delivery path. 
+ * + * A concurrent lookup can still find the timer in the hash, but it + * will check timer::it_signal with timer::it_lock held and observe + * bit 0 set, which invalidates it. That also prevents the timer ID + * from being handed out before this timer is completely gone. + */ timer->it_signal_seq++; - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); + scoped_guard (spinlock, ¤t->sighand->siglock) { + unsigned long sig = (unsigned long)timer->it_signal | 1UL; + + WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); + hlist_del_rcu(&timer->list); + posix_timer_cleanup_ignored(timer); + } + + while (timer->kclock->timer_del(timer) == TIMER_RETRY) { + guard(rcu)(); + spin_unlock_irq(&timer->it_lock); + timer_wait_running(timer); + spin_lock_irq(&timer->it_lock); + } } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; - unsigned long flags; - timer = lock_timer(timer_id, &flags); - -retry_delete: - if (!timer) - return -EINVAL; - - if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { - /* Unlocks and relocks the timer if it still exists */ - timer = timer_wait_running(timer, &flags); - goto retry_delete; + scoped_timer_get_or_fail(timer_id) { + timer = scoped_timer; + posix_timer_delete(timer); } - - spin_lock(¤t->sighand->siglock); - hlist_del(&timer->list); - posix_timer_cleanup_ignored(timer); - /* - * A concurrent lookup could check timer::it_signal lockless. It - * will reevaluate with timer::it_lock held and observe the NULL. - * - * It must be written with siglock held so that the signal code - * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), - * which prevents it from moving a pending signal of a deleted - * timer to the ignore list. - */ - WRITE_ONCE(timer->it_signal, NULL); - spin_unlock(¤t->sighand->siglock); - - unlock_timer(timer, flags); + /* Remove it from the hash, which frees up the timer ID */ posix_timer_unhash_and_free(timer); return 0; } /* - * Delete a timer if it is armed, remove it from the hash and schedule it - * for RCU freeing. - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - - /* - * irqsave is required to make timer_wait_running() work. - */ - spin_lock_irqsave(&timer->it_lock, flags); - -retry_delete: - /* - * Even if the timer is not longer accessible from other tasks - * it still might be armed and queued in the underlying timer - * mechanism. Worse, that timer mechanism might run the expiry - * function concurrently. - */ - if (timer_delete_hook(timer) == TIMER_RETRY) { - /* - * Timer is expired concurrently, prevent livelocks - * and pointless spinning on RT. - * - * timer_wait_running() drops timer::it_lock, which opens - * the possibility for another task to delete the timer. - * - * That's not possible here because this is invoked from - * do_exit() only for the last thread of the thread group. - * So no other task can access and delete that timer. - */ - if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) - return; - - goto retry_delete; - } - hlist_del(&timer->list); - - posix_timer_cleanup_ignored(timer); - - /* - * Setting timer::it_signal to NULL is technically not required - * here as nothing can access the timer anymore legitimately via - * the hash table. Set it to NULL nevertheless so that all deletion - * paths are consistent. 
- */ - WRITE_ONCE(timer->it_signal, NULL); - - spin_unlock_irqrestore(&timer->it_lock, flags); - posix_timer_unhash_and_free(timer); -} - -/* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. @@ -1089,18 +1086,26 @@ retry_delete: void exit_itimers(struct task_struct *tsk) { struct hlist_head timers; + struct hlist_node *next; + struct k_itimer *timer; + + /* Clear restore mode for exec() */ + tsk->signal->timer_create_restore_ids = 0; if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ - spin_lock_irq(&tsk->sighand->siglock); - hlist_move_list(&tsk->signal->posix_timers, &timers); - spin_unlock_irq(&tsk->sighand->siglock); + scoped_guard (spinlock_irq, &tsk->sighand->siglock) + hlist_move_list(&tsk->signal->posix_timers, &timers); /* The timers are not longer accessible via tsk::signal */ - while (!hlist_empty(&timers)) - itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + hlist_for_each_entry_safe(timer, next, &timers, list) { + scoped_guard (spinlock_irq, &timer->it_lock) + posix_timer_delete(timer); + posix_timer_unhash_and_free(timer); + cond_resched(); + } /* * There should be no timers on the ignored list. itimer_delete() has @@ -1545,3 +1550,26 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id) return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } + +static int __init posixtimer_init(void) +{ + unsigned long i, size; + unsigned int shift; + + if (IS_ENABLED(CONFIG_BASE_SMALL)) + size = 512; + else + size = roundup_pow_of_two(512 * num_possible_cpus()); + + timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), + size, 0, 0, &shift, NULL, size, size); + size = 1UL << shift; + timer_hashmask = size - 1; + + for (i = 0; i < size; i++) { + spin_lock_init(&timer_buckets[i].lock); + INIT_HLIST_HEAD(&timer_buckets[i].head); + } + return 0; +} +core_initcall(posixtimer_init); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fcca4e72f1ef..cc15fe293719 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. 
*/ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - sched_clock_timer.function = sched_clock_poll; + hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index e28f9210f8a1..a88b72b0f35e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -100,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - bctimer.function = bc_handler; + hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa058510af9c..c527b421c865 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1573,12 +1573,10 @@ void tick_setup_sched_timer(bool hrtimer) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); - ts->sched_timer.function = tick_nohz_handler; - } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1e67d076f195..929846b8b45a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -682,20 +682,19 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act } /** - * timekeeping_forward_now - update clock to the current time + * timekeeping_forward - update clock to given cycle now value * @tk: Pointer to the timekeeper to update + * @cycle_now: Current clocksource read value * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. */ -static void timekeeping_forward_now(struct timekeeper *tk) +static void timekeeping_forward(struct timekeeper *tk, u64 cycle_now) { - u64 cycle_now, delta; + u64 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); - cycle_now = tk_clock_read(&tk->tkr_mono); - delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, - tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -711,6 +710,21 @@ static void timekeeping_forward_now(struct timekeeper *tk) } /** + * timekeeping_forward_now - update clock to the current time + * @tk: Pointer to the timekeeper to update + * + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. + */ +static void timekeeping_forward_now(struct timekeeper *tk) +{ + u64 cycle_now = tk_clock_read(&tk->tkr_mono); + + timekeeping_forward(tk, cycle_now); +} + +/** * ktime_get_real_ts64 - Returns the time of day in a timespec64. 
* @ts: pointer to the timespec to be set * @@ -2151,6 +2165,54 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, return offset; } +static u64 timekeeping_accumulate(struct timekeeper *tk, u64 offset, + enum timekeeping_adv_mode mode, + unsigned int *clock_set) +{ + int shift = 0, maxshift; + + /* + * TK_ADV_FREQ indicates that adjtimex(2) directly set the + * frequency or the tick length. + * + * Accumulate the offset, so that the new multiplier starts from + * now. This is required as otherwise for offsets, which are + * smaller than tk::cycle_interval, timekeeping_adjust() could set + * xtime_nsec backwards, which subsequently causes time going + * backwards in the coarse time getters. But even for the case + * where offset is greater than tk::cycle_interval the periodic + * accumulation does not have much value. + * + * Also reset tk::ntp_error as it does not make sense to keep the + * old accumulated error around in this case. + */ + if (mode == TK_ADV_FREQ) { + timekeeping_forward(tk, tk->tkr_mono.cycle_last + offset); + tk->ntp_error = 0; + return 0; + } + + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller than the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. + */ + shift = ilog2(offset) - ilog2(tk->cycle_interval); + shift = max(0, shift); + /* Bound shift to one less than what overflows tick_length */ + maxshift = (64 - (ilog2(ntp_tick_length()) + 1)) - 1; + shift = min(shift, maxshift); + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift, clock_set); + if (offset < tk->cycle_interval << shift) + shift--; + } + return offset; +} + /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length @@ -2160,7 +2222,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; - int shift = 0, maxshift; u64 offset; guard(raw_spinlock_irqsave)(&tk_core.lock); @@ -2177,24 +2238,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; - /* - * With NO_HZ we may have to accumulate many cycle_intervals - * (think "ticks") worth of time at once. To do this efficiently, - * we calculate the largest doubling multiple of cycle_intervals - * that is smaller than the offset. We then accumulate that - * chunk in one go, and then try to consume the next smaller - * doubled multiple. 
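The new timekeeping_accumulate() keeps the doubling strategy described in the comment being moved here: pick the largest power-of-two multiple of cycle_interval that still fits into the offset, consume it, then retry with half the chunk. A small self-contained sketch of that shift selection with made-up numbers (it omits the NTP tick-length bound, maxshift, that the kernel also applies):

#include <stdio.h>

/* Integer log2 as ilog2() computes it in the kernel (undefined for 0). */
static int ilog2_ull(unsigned long long v)
{
	int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	unsigned long long cycle_interval = 1000;	/* cycles per tick, made up */
	unsigned long long offset = 70000;		/* cycles pending accumulation */
	int shift = ilog2_ull(offset) - ilog2_ull(cycle_interval);

	if (shift < 0)
		shift = 0;

	/* Consume the offset in halving power-of-two chunks of cycle_interval. */
	while (offset >= cycle_interval) {
		unsigned long long chunk = cycle_interval << shift;

		if (offset >= chunk) {
			offset -= chunk;
			printf("accumulated %llu cycles at shift %d, %llu left\n",
			       chunk, shift, offset);
		}
		if (offset < cycle_interval << shift)
			shift--;
	}
	return 0;
}

With these inputs the loop consumes chunks of 64000, 4000 and 2000 cycles; the TK_ADV_FREQ case above bypasses all of this and forwards the clock to cycle_last + offset in one step instead.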
- */ - shift = ilog2(offset) - ilog2(tk->cycle_interval); - shift = max(0, shift); - /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; - shift = min(shift, maxshift); - while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift, &clock_set); - if (offset < tk->cycle_interval<<shift) - shift--; - } + offset = timekeeping_accumulate(tk, offset, mode, &clock_set); /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1c311c46da50..cfbb46cc4e76 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,7 +98,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 9cb9b6584ea1..2f6330831f08 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1675,6 +1675,9 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) } while (i < tmigr_hierarchy_levels); + /* Assert single root */ + WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + while (i > 0) { group = stack[--i]; @@ -1716,7 +1719,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) WARN_ON_ONCE(top == 0); lvllist = &tmigr_level_list[top]; - if (group->num_children == 1 && list_is_singular(lvllist)) { + + /* + * Newly created root level should have accounted the upcoming + * CPU's child group and pre-accounted the old root. + */ + if (group->num_children == 2 && list_is_singular(lvllist)) { /* * The target CPU must never do the prepare work, except * on early boot when the boot CPU is the target. 
Otherwise diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 05d383143165..01c2ab1e8971 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,29 +15,29 @@ #include "timekeeping_internal.h" -static inline void update_vdso_data(struct vdso_data *vdata, - struct timekeeper *tk) +static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; + vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; #endif - vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; + vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; #endif - vdata[CS_RAW].mask = tk->tkr_raw.mask; - vdata[CS_RAW].mult = tk->tkr_raw.mult; - vdata[CS_RAW].shift = tk->tkr_raw.shift; + vc[CS_RAW].mask = tk->tkr_raw.mask; + vc[CS_RAW].mult = tk->tkr_raw.mult; + vc[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; @@ -55,7 +55,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { @@ -65,19 +65,20 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; @@ -86,21 +87,21 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; - vdata[CS_HRES_COARSE].clock_mode = clock_mode; - vdata[CS_RAW].clock_mode = clock_mode; + vc[CS_HRES_COARSE].clock_mode = clock_mode; + vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* 
CLOCK_REALTIME_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; /* CLOCK_MONOTONIC_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; @@ -108,32 +109,31 @@ void update_vsyscall(struct timekeeper *tk) /* * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) - update_vdso_data(vdata, tk); + update_vdso_time_data(vdata, tk); __arch_update_vsyscall(vdata); vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + vdata->tz_minuteswest = sys_tz.tz_minuteswest; + vdata->tz_dsttime = sys_tz.tz_dsttime; - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } /** @@ -150,7 +150,7 @@ void update_vsyscall_tz(void) */ unsigned long vdso_update_begin(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); @@ -167,9 +167,9 @@ unsigned long vdso_update_begin(void) */ void vdso_update_end(unsigned long flags) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); } diff --git a/kernel/torture.c b/kernel/torture.c index dede150aef01..3a0a8cc60401 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -792,6 +792,8 @@ static void torture_stutter_cleanup(void) stutter_task = NULL; } +static unsigned long torture_init_jiffies; + static void torture_print_module_parms(void) { @@ -821,6 +823,7 @@ bool torture_init_begin(char *ttype, int v) torture_type = ttype; verbose = v; fullstop = FULLSTOP_DONTSTOP; + WRITE_ONCE(torture_init_jiffies, jiffies); // Lockless reads. torture_print_module_parms(); return true; } @@ -837,6 +840,15 @@ void torture_init_end(void) EXPORT_SYMBOL_GPL(torture_init_end); /* + * Get the torture_init_begin()-time value of the jiffies counter. + */ +unsigned long get_torture_init_jiffies(void) +{ + return READ_ONCE(torture_init_jiffies); +} +EXPORT_SYMBOL_GPL(get_torture_init_jiffies); + +/* * Clean up torture module. Please note that this is -not- invoked via * the usual module_exit() mechanism, but rather by an explicit call from * the client torture module. Returns true if a race with system shutdown diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d570b8b9c0a9..033fba0633cf 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -263,6 +263,18 @@ config FUNCTION_GRAPH_RETADDR the function is called. 
This feature is off by default, and you can enable it via the trace option funcgraph-retaddr. +config FUNCTION_TRACE_ARGS + bool + depends on HAVE_FUNCTION_ARG_ACCESS_API + depends on DEBUG_INFO_BTF + default y + help + If supported with function argument access API and BTF, then + the function tracer and function graph tracer will support printing + of function arguments. This feature is off by default, and can be + enabled via the trace option func-args (for the function tracer) and + funcgraph-args (for the function graph tracer) + config DYNAMIC_FTRACE bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index adc947587eb8..187dc37d61d4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -392,7 +392,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_SIZE, }; -static void __set_printk_clr_event(void) +static void __set_printk_clr_event(struct work_struct *work) { /* * This program might be calling bpf_trace_printk, @@ -405,10 +405,11 @@ static void __set_printk_clr_event(void) if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1)) pr_warn_ratelimited("could not enable bpf_trace_printk events"); } +static DECLARE_WORK(set_printk_work, __set_printk_clr_event); const struct bpf_func_proto *bpf_get_trace_printk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_printk_proto; } @@ -451,7 +452,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = { const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void) { - __set_printk_clr_event(); + schedule_work(&set_printk_work); return &bpf_trace_vprintk_proto; } @@ -606,6 +607,11 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; +const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void) +{ + return &bpf_perf_event_read_value_proto; +} + static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_raw_record *raw, @@ -843,7 +849,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc if (unlikely(is_global_init(task))) return -EPERM; - if (!preemptible()) { + if (preempt_count() != 0 || irqs_disabled()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ @@ -1038,27 +1044,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_X86_KERNEL_IBT -static unsigned long get_entry_ip(unsigned long fentry_ip) +static inline unsigned long get_entry_ip(unsigned long fentry_ip) { - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. 
- */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) +#ifdef CONFIG_X86_KERNEL_IBT + if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; +#endif return fentry_ip; } -#else -#define get_entry_ip(fentry_ip) fentry_ip -#endif BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { @@ -2345,10 +2338,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address((unsigned long)btp); module_put(mod); - preempt_enable(); } static __always_inline @@ -2932,18 +2924,21 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3 u32 i, err = 0; for (i = 0; i < addrs_cnt; i++) { + bool skip_add = false; struct module *mod; - preempt_disable(); - mod = __module_address(addrs[i]); - /* Either no module or we it's already stored */ - if (!mod || has_module(&arr, mod)) { - preempt_enable(); - continue; + scoped_guard(rcu) { + mod = __module_address(addrs[i]); + /* Either no module or it's already stored */ + if (!mod || has_module(&arr, mod)) { + skip_add = true; + break; /* scoped_guard */ + } + if (!try_module_get(mod)) + err = -EINVAL; } - if (!try_module_get(mod)) - err = -EINVAL; - preempt_enable(); + if (skip_add) + continue; if (err) break; err = add_module(&arr, mod); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5dddfc2149f6..8d925cbdce3a 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -865,7 +865,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe } /* - * After all architecures have selected HAVE_FUNCTION_GRAPH_FREGS, we can + * After all architectures have selected HAVE_FUNCTION_GRAPH_FREGS, we can * leave only ftrace_return_to_handler(fregs). */ #ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 2560b312ad57..33082c4e8154 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -403,13 +403,12 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num) lockdep_assert_held(&fprobe_mutex); fprobe_graph_active--; - if (!fprobe_graph_active) { - /* Q: should we unregister it ? */ + /* Q: should we unregister it ? 
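[Editorial aside] The bpf_trace.c hunks above swap open-coded preempt_disable()/preempt_enable() pairs for guard(rcu)() and scoped_guard(rcu), which drop the protection automatically when the scope ends. A rough userspace analogue of that style, built on the same scope-exit mechanism (__attribute__((cleanup))) but guarding a pthread mutex rather than RCU; everything here is illustrative only:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int counter;

/* Runs automatically when the guarded variable goes out of scope. */
static void unlock_cleanup(pthread_mutex_t **l)
{
	pthread_mutex_unlock(*l);
}

static void bump(void)
{
	/* Loose equivalent of a guard: the lock is held until return. */
	pthread_mutex_t *guard __attribute__((cleanup(unlock_cleanup))) = &lock;

	pthread_mutex_lock(guard);
	counter++;
	/* No explicit unlock on any exit path; the cleanup handler does it. */
}

int main(void)
{
	bump();
	bump();
	printf("counter = %d\n", counter);
	return 0;
}

The point of the kernel change is the same: every early exit from the guarded scope (the skip_add and error cases in get_modules_for_addrs()) now drops RCU protection without a matching explicit call.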
*/ + if (!fprobe_graph_active) unregister_ftrace_graph(&fprobe_graph_ops); - return; - } - ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); + if (num) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } static int symbols_cmp(const void *a, const void *b) @@ -679,8 +678,7 @@ int unregister_fprobe(struct fprobe *fp) } del_fprobe_hash(fp); - if (count) - fprobe_graph_remove_ips(addrs, count); + fprobe_graph_remove_ips(addrs, count); kfree_rcu(hlist_array, rcu); fp->hlist_array = NULL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 728ecda6e8d4..92015de6203d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -540,6 +540,7 @@ static int function_stat_show(struct seq_file *m, void *v) static struct trace_seq s; unsigned long long avg; unsigned long long stddev; + unsigned long long stddev_denom; #endif guard(mutex)(&ftrace_profile_lock); @@ -559,23 +560,19 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - /* Sample standard deviation (s^2) */ - if (rec->counter <= 1) - stddev = 0; - else { - /* - * Apply Welford's method: - * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) - */ + /* + * Variance formula: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + * Maybe Welford's method is better here? + * Divide only by 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide by 1000 again. + */ + stddev = 0; + stddev_denom = rec->counter * (rec->counter - 1) * 1000; + if (stddev_denom) { stddev = rec->counter * rec->time_squared - rec->time * rec->time; - - /* - * Divide only 1000 for ns^2 -> us^2 conversion. - * trace_print_graph_duration will divide 1000 again. - */ - stddev = div64_ul(stddev, - rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, stddev_denom); } trace_seq_init(&s); @@ -1296,6 +1293,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); } @@ -3220,15 +3219,22 @@ static struct ftrace_hash *copy_hash(struct ftrace_hash *src) * The filter_hash updates uses just the append_hash() function * and the notrace_hash does not. 
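[Editorial aside] The reworked function_stat_show() above keeps the closed-form sample variance, s^2 = (n * sum(x_i^2) - (sum x_i)^2) / (n * (n - 1)), and simply folds the n <= 1 case into a zero-denominator check. A small standalone C sketch of that arithmetic, with made-up sample values (the extra /1000 unit conversion done by the tracer is omitted):

#include <stdio.h>
#include <stdint.h>

/*
 * Sample variance from running sums, the same closed form used by
 * function_stat_show():
 *   s^2 = (n * sum(x_i^2) - (sum x_i)^2) / (n * (n - 1))
 */
static uint64_t variance(const uint64_t *x, uint64_t n)
{
	uint64_t sum = 0, sum_sq = 0, denom;

	for (uint64_t i = 0; i < n; i++) {
		sum += x[i];
		sum_sq += x[i] * x[i];
	}

	denom = n * (n - 1);	/* zero for n <= 1: report 0, as the kernel code does */
	if (!denom)
		return 0;

	return (n * sum_sq - sum * sum) / denom;
}

int main(void)
{
	uint64_t samples[] = { 120, 135, 110, 150, 140 };	/* invented durations */

	printf("s^2 = %llu\n", (unsigned long long)variance(samples, 5));
	return 0;
}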
*/ -static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash) +static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, + int size_bits) { struct ftrace_func_entry *entry; int size; int i; - /* An empty hash does everything */ - if (ftrace_hash_empty(*hash)) - return 0; + if (*hash) { + /* An empty hash does everything */ + if (ftrace_hash_empty(*hash)) + return 0; + } else { + *hash = alloc_ftrace_hash(size_bits); + if (!*hash) + return -ENOMEM; + } /* If new_hash has everything make hash have everything */ if (ftrace_hash_empty(new_hash)) { @@ -3292,16 +3298,18 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has /* Return a new hash that has a union of all @ops->filter_hash entries */ static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) { - struct ftrace_hash *new_hash; + struct ftrace_hash *new_hash = NULL; struct ftrace_ops *subops; + int size_bits; int ret; - new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits); - if (!new_hash) - return NULL; + if (ops->func_hash->filter_hash) + size_bits = ops->func_hash->filter_hash->size_bits; + else + size_bits = FTRACE_HASH_DEFAULT_BITS; list_for_each_entry(subops, &ops->subop_list, list) { - ret = append_hash(&new_hash, subops->func_hash->filter_hash); + ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits); if (ret < 0) { free_ftrace_hash(new_hash); return NULL; @@ -3310,7 +3318,8 @@ static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) if (ftrace_hash_empty(new_hash)) break; } - return new_hash; + /* Can't return NULL as that means this failed */ + return new_hash ? : EMPTY_HASH; } /* Make @ops trace evenything except what all its subops do not trace */ @@ -3505,7 +3514,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); if (!filter_hash) return -ENOMEM; - ret = append_hash(&filter_hash, subops->func_hash->filter_hash); + ret = append_hash(&filter_hash, subops->func_hash->filter_hash, + size_bits); if (ret < 0) { free_ftrace_hash(filter_hash); return ret; @@ -5707,6 +5717,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return -ENOENT; free_hash_entry(hash, entry); return 0; + } else if (__ftrace_lookup_ip(hash, ip) != NULL) { + /* Already exists */ + return 0; } entry = add_hash_entry(hash, ip); @@ -7005,6 +7018,7 @@ static int ftrace_process_locs(struct module *mod, unsigned long *p; unsigned long addr; unsigned long flags = 0; /* Shut up gcc */ + unsigned long pages; int ret = -ENOMEM; count = end - start; @@ -7012,6 +7026,8 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; + pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); + /* * Sorting mcount in vmlinux at build time depend on * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in @@ -7056,7 +7072,9 @@ static int ftrace_process_locs(struct module *mod, pg = start_pg; while (p < end) { unsigned long end_offset; - addr = ftrace_call_adjust(*p++); + + addr = *p++; + /* * Some architecture linkers will pad between * the different mcount_loc sections of different @@ -7068,6 +7086,19 @@ static int ftrace_process_locs(struct module *mod, continue; } + /* + * If this is core kernel, make sure the address is in core + * or inittext, as weak functions get zeroed and KASLR can + * move them to something other than zero. It just will not + * move it to an area where kernel text is. 
+ */ + if (!mod && !(is_kernel_text(addr) || is_kernel_inittext(addr))) { + skipped++; + continue; + } + + addr = ftrace_call_adjust(addr); + end_offset = (pg->index+1) * sizeof(pg->records[0]); if (end_offset > PAGE_SIZE << pg->order) { /* We should have allocated enough */ @@ -7107,11 +7138,41 @@ static int ftrace_process_locs(struct module *mod, /* We should have used all pages unless we skipped some */ if (pg_unuse) { - WARN_ON(!skipped); + unsigned long pg_remaining, remaining = 0; + unsigned long skip; + + /* Count the number of entries unused and compare it to skipped. */ + pg_remaining = (ENTRIES_PER_PAGE << pg->order) - pg->index; + + if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) { + + skip = skipped - pg_remaining; + + for (pg = pg_unuse; pg; pg = pg->next) + remaining += 1 << pg->order; + + pages -= remaining; + + skip = DIV_ROUND_UP(skip, ENTRIES_PER_PAGE); + + /* + * Check to see if the number of pages remaining would + * just fit the number of entries skipped. + */ + WARN(skip != remaining, "Extra allocated pages for ftrace: %lu with %lu skipped", + remaining, skipped); + } /* Need to synchronize with ftrace_location_range() */ synchronize_rcu(); ftrace_free_pages(pg_unuse); } + + if (!mod) { + count -= skipped; + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, pages); + } + return ret; } @@ -7757,9 +7818,6 @@ void __init ftrace_init(void) goto failed; } - pr_info("ftrace: allocating %ld entries in %ld pages\n", - count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE)); - ret = ftrace_process_locs(NULL, __start_mcount_loc, __stop_mcount_loc); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b8e0ae15ca5b..9d4d951090d3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1672,7 +1672,8 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) * must be the same. */ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages) + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1680,6 +1681,9 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, unsigned long buffers_end; int i; + if (!subbuf_mask) + return false; + /* Check the meta magic and meta struct size */ if (meta->magic != RING_BUFFER_META_MAGIC || meta->struct_size != sizeof(*meta)) { @@ -1712,6 +1716,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, subbuf = rb_subbufs_from_meta(meta); + bitmap_clear(subbuf_mask, 0, meta->nr_subbufs); + /* Is the meta buffers and the subbufs themselves have correct data? 
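[Editorial aside] The ftrace_process_locs() changes above pre-compute pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE) and later reconcile the leftover pages against the number of skipped entries. DIV_ROUND_UP() is plain round-up integer division; a one-file demo with invented counts:

#include <stdio.h>

/* Round-up integer division, mirroring the kernel macro's behaviour. */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long count = 45000;		/* invented number of mcount entries */
	unsigned long entries_per_page = 512;	/* invented ENTRIES_PER_PAGE value   */

	printf("%lu entries -> %lu pages\n",
	       count, DIV_ROUND_UP(count, entries_per_page));
	return 0;
}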
*/ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || @@ -1725,6 +1731,12 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return false; } + if (test_bit(meta->buffers[i], subbuf_mask)) { + pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu); + return false; + } + + set_bit(meta->buffers[i], subbuf_mask); subbuf = (void *)subbuf + subbuf_size; } @@ -1838,6 +1850,11 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->cpu); goto invalid; } + + /* If the buffer has content, update pages_touched */ + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; entry_bytes += local_read(&head_page->page->commit); local_set(&cpu_buffer->head_page->entries, ret); @@ -1889,17 +1906,22 @@ static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; + unsigned long *subbuf_mask; unsigned long delta; void *subbuf; int cpu; int i; + /* Create a mask to test the subbuf array */ + subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); + /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages)) { + if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; @@ -1943,6 +1965,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf += meta->subbuf_size; } } + bitmap_free(subbuf_mask); } static void *rbm_start(struct seq_file *m, loff_t *pos) @@ -5295,7 +5318,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it - * than it will be '2' or already moved '0'. + * then it will be '2' or already moved '0'. 
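[Editorial aside] rb_meta_valid() above now clears a bitmap and test_bit()/set_bit()s every subbuffer index, so a stale boot-mapped meta page containing a duplicated index is rejected instead of trusted. A compact userspace sketch of the same duplicate/out-of-range check, with a plain bool array standing in for the kernel bitmap and made-up sizes:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_SUBBUFS 8

/* Reject out-of-range or duplicate indices, as the subbuf_mask check does. */
static bool indices_valid(const int *idx, int n)
{
	bool seen[NR_SUBBUFS];

	memset(seen, 0, sizeof(seen));

	for (int i = 0; i < n; i++) {
		if (idx[i] < 0 || idx[i] >= NR_SUBBUFS)
			return false;		/* out of range */
		if (seen[idx[i]])
			return false;		/* duplicate */
		seen[idx[i]] = true;
	}
	return true;
}

int main(void)
{
	int good[] = { 0, 3, 2, 1 };
	int dup[]  = { 0, 3, 3, 1 };

	printf("good: %s\n", indices_valid(good, 4) ? "valid" : "invalid");
	printf("dup:  %s\n", indices_valid(dup, 4) ? "valid" : "invalid");
	return 0;
}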
*/ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); @@ -7126,6 +7149,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); } unlock: diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 8226352a0062..b39f36013ef2 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -27,6 +27,13 @@ menuconfig RV source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" +source "kernel/trace/rv/monitors/sched/Kconfig" +source "kernel/trace/rv/monitors/tss/Kconfig" +source "kernel/trace/rv/monitors/sco/Kconfig" +source "kernel/trace/rv/monitors/snroc/Kconfig" +source "kernel/trace/rv/monitors/scpd/Kconfig" +source "kernel/trace/rv/monitors/snep/Kconfig" +source "kernel/trace/rv/monitors/sncid/Kconfig" # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 188b64668e1f..f9b2cd0483c3 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -5,6 +5,13 @@ ccflags-y += -I $(src) # needed for trace events obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o +obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o +obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o +obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o +obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o +obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o +obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig new file mode 100644 index 000000000000..ae3eb410abd7 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCHED + depends on RV + bool "sched monitor" + help + Collection of monitors to check the scheduler behaves according to specifications. + Enable this to enable all scheduler specification supported by the current kernel. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c new file mode 100644 index 000000000000..905e03c3c934 --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> + +#define MODULE_NAME "sched" + +#include "sched.h" + +struct rv_monitor rv_sched; + +struct rv_monitor rv_sched = { + .name = "sched", + .description = "container for several scheduler monitor specifications.", + .enable = NULL, + .disable = NULL, + .reset = NULL, + .enabled = 0, +}; + +static int __init register_sched(void) +{ + rv_register_monitor(&rv_sched, NULL); + return 0; +} + +static void __exit unregister_sched(void) +{ + rv_unregister_monitor(&rv_sched); +} + +module_init(register_sched); +module_exit(unregister_sched); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sched: container for several scheduler monitor specifications."); diff --git a/kernel/trace/rv/monitors/sched/sched.h b/kernel/trace/rv/monitors/sched/sched.h new file mode 100644 index 000000000000..ba148dd8d48b --- /dev/null +++ b/kernel/trace/rv/monitors/sched/sched.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_sched; diff --git a/kernel/trace/rv/monitors/sco/Kconfig b/kernel/trace/rv/monitors/sco/Kconfig new file mode 100644 index 000000000000..097c96cccdd7 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCO + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sco monitor" + help + Monitor to ensure sched_set_state happens only in thread context. + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c new file mode 100644 index 000000000000..4cff59220bfc --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sco" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sco.h" + +static struct rv_monitor rv_sco; +DECLARE_DA_MON_PER_CPU(sco, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_start_event_sco(sched_set_state_sco); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_sco(schedule_entry_sco); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_sco(schedule_exit_sco); +} + +static int enable_sco(void) +{ + int retval; + + retval = da_monitor_init_sco(); + if (retval) + return retval; + + rv_attach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_sco(void) +{ + rv_sco.enabled = 0; + + rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_sco(); +} + +static struct rv_monitor rv_sco = { + .name = "sco", + .description = "scheduling context operations.", + .enable = enable_sco, + .disable = disable_sco, + .reset = da_monitor_reset_all_sco, + .enabled = 0, +}; + +static int __init register_sco(void) +{ + rv_register_monitor(&rv_sco, &rv_sched); + return 0; +} + +static void __exit unregister_sco(void) +{ + rv_unregister_monitor(&rv_sco); +} + +module_init(register_sco); +module_exit(unregister_sco); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sco: scheduling context operations."); diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h new file mode 100644 index 000000000000..7a4c1f2d5ca1 --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sco automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sco { + thread_context_sco = 0, + scheduling_context_sco, + state_max_sco +}; + +#define INVALID_STATE state_max_sco + +enum events_sco { + sched_set_state_sco = 0, + schedule_entry_sco, + schedule_exit_sco, + event_max_sco +}; + +struct automaton_sco { + char *state_names[state_max_sco]; + char *event_names[event_max_sco]; + unsigned char function[state_max_sco][event_max_sco]; + unsigned char initial_state; + bool final_states[state_max_sco]; +}; + +static const struct automaton_sco automaton_sco = { + .state_names = { + "thread_context", + "scheduling_context" + }, + .event_names = { + "sched_set_state", + "schedule_entry", + "schedule_exit" + 
}, + .function = { + { thread_context_sco, scheduling_context_sco, INVALID_STATE }, + { INVALID_STATE, INVALID_STATE, thread_context_sco }, + }, + .initial_state = thread_context_sco, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sco/sco_trace.h b/kernel/trace/rv/monitors/sco/sco_trace.h new file mode 100644 index 000000000000..b711cd9024ec --- /dev/null +++ b/kernel/trace/rv/monitors/sco/sco_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCO +DEFINE_EVENT(event_da_monitor, event_sco, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sco, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCO */ diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig new file mode 100644 index 000000000000..b9114fbf680f --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SCPD + depends on RV + depends on PREEMPT_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "scpd monitor" + help + Monitor to ensure schedule is called with preemption disabled. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c new file mode 100644 index 000000000000..cbdd6a5f8d7f --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "scpd" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "scpd.h" + +static struct rv_monitor rv_scpd; +DECLARE_DA_MON_PER_CPU(scpd, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_scpd(preempt_disable_scpd); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_scpd(preempt_enable_scpd); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_scpd(schedule_entry_scpd); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_event_scpd(schedule_exit_scpd); +} + +static int enable_scpd(void) +{ + int retval; + + retval = da_monitor_init_scpd(); + if (retval) + return retval; + + rv_attach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("scpd", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_scpd(void) +{ + rv_scpd.enabled = 0; + + rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("scpd", sched_exit_tp, 
handle_schedule_exit); + + da_monitor_destroy_scpd(); +} + +static struct rv_monitor rv_scpd = { + .name = "scpd", + .description = "schedule called with preemption disabled.", + .enable = enable_scpd, + .disable = disable_scpd, + .reset = da_monitor_reset_all_scpd, + .enabled = 0, +}; + +static int __init register_scpd(void) +{ + rv_register_monitor(&rv_scpd, &rv_sched); + return 0; +} + +static void __exit unregister_scpd(void) +{ + rv_unregister_monitor(&rv_scpd); +} + +module_init(register_scpd); +module_exit(unregister_scpd); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("scpd: schedule called with preemption disabled."); diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h new file mode 100644 index 000000000000..295f735a5811 --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of scpd automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_scpd { + cant_sched_scpd = 0, + can_sched_scpd, + state_max_scpd +}; + +#define INVALID_STATE state_max_scpd + +enum events_scpd { + preempt_disable_scpd = 0, + preempt_enable_scpd, + schedule_entry_scpd, + schedule_exit_scpd, + event_max_scpd +}; + +struct automaton_scpd { + char *state_names[state_max_scpd]; + char *event_names[event_max_scpd]; + unsigned char function[state_max_scpd][event_max_scpd]; + unsigned char initial_state; + bool final_states[state_max_scpd]; +}; + +static const struct automaton_scpd automaton_scpd = { + .state_names = { + "cant_sched", + "can_sched" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", + "schedule_exit" + }, + .function = { + { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE }, + { INVALID_STATE, cant_sched_scpd, can_sched_scpd, can_sched_scpd }, + }, + .initial_state = cant_sched_scpd, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/scpd/scpd_trace.h b/kernel/trace/rv/monitors/scpd/scpd_trace.h new file mode 100644 index 000000000000..6b0f4aa4732e --- /dev/null +++ b/kernel/trace/rv/monitors/scpd/scpd_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SCPD +DEFINE_EVENT(event_da_monitor, event_scpd, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_scpd, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SCPD */ diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sncid/Kconfig new file mode 100644 index 000000000000..76bcfef4fd10 --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNCID + depends on RV + depends on IRQSOFF_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sncid monitor" + help + Monitor to ensure schedule is not called with interrupt disabled. + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c new file mode 100644 index 000000000000..f5037cd6214c --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "sncid" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "sncid.h" + +static struct rv_monitor rv_sncid; +DECLARE_DA_MON_PER_CPU(sncid, unsigned char); + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sncid(irq_disable_sncid); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_sncid(irq_enable_sncid); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_start_event_sncid(schedule_entry_sncid); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_sncid(schedule_exit_sncid); +} + +static int enable_sncid(void) +{ + int retval; + + retval = da_monitor_init_sncid(); + if (retval) + return retval; + + rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable); + rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable); + rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_sncid(void) +{ + rv_sncid.enabled = 0; + + rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable); + rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable); + rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_sncid(); +} + +static struct rv_monitor rv_sncid = { + .name = "sncid", + .description = "schedule not called with interrupt disabled.", + .enable = enable_sncid, + .disable = disable_sncid, + .reset = da_monitor_reset_all_sncid, + .enabled = 0, +}; + +static int __init register_sncid(void) +{ + rv_register_monitor(&rv_sncid, &rv_sched); + return 0; +} + +static void __exit unregister_sncid(void) +{ + rv_unregister_monitor(&rv_sncid); +} + +module_init(register_sncid); +module_exit(unregister_sncid); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled."); diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h new file mode 100644 index 000000000000..21304725142b --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sncid automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sncid { + can_sched_sncid = 0, + cant_sched_sncid, + state_max_sncid +}; + +#define INVALID_STATE state_max_sncid + +enum events_sncid { + irq_disable_sncid = 0, + irq_enable_sncid, + schedule_entry_sncid, + schedule_exit_sncid, + event_max_sncid 
+}; + +struct automaton_sncid { + char *state_names[state_max_sncid]; + char *event_names[event_max_sncid]; + unsigned char function[state_max_sncid][event_max_sncid]; + unsigned char initial_state; + bool final_states[state_max_sncid]; +}; + +static const struct automaton_sncid automaton_sncid = { + .state_names = { + "can_sched", + "cant_sched" + }, + .event_names = { + "irq_disable", + "irq_enable", + "schedule_entry", + "schedule_exit" + }, + .function = { + { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid }, + { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE }, + }, + .initial_state = can_sched_sncid, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/sncid/sncid_trace.h new file mode 100644 index 000000000000..3ce42a57671d --- /dev/null +++ b/kernel/trace/rv/monitors/sncid/sncid_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNCID +DEFINE_EVENT(event_da_monitor, event_sncid, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sncid, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SNCID */ diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig new file mode 100644 index 000000000000..77527f971232 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNEP + depends on RV + depends on PREEMPT_TRACER + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "snep monitor" + help + Monitor to ensure schedule does not enable preempt. + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c new file mode 100644 index 000000000000..0076ba6d7ea4 --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "snep" + +#include <trace/events/sched.h> +#include <trace/events/preemptirq.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "snep.h" + +static struct rv_monitor rv_snep; +DECLARE_DA_MON_PER_CPU(snep, unsigned char); + +static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_snep(preempt_disable_snep); +} + +static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_start_event_snep(preempt_enable_snep); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_snep(schedule_entry_snep); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_snep(schedule_exit_snep); +} + +static int enable_snep(void) +{ + int retval; + + retval = da_monitor_init_snep(); + if (retval) + return retval; + + rv_attach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("snep", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_snep(void) +{ + rv_snep.enabled = 0; + + rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_snep(); +} + +static struct rv_monitor rv_snep = { + .name = "snep", + .description = "schedule does not enable preempt.", + .enable = enable_snep, + .disable = disable_snep, + .reset = da_monitor_reset_all_snep, + .enabled = 0, +}; + +static int __init register_snep(void) +{ + rv_register_monitor(&rv_snep, &rv_sched); + return 0; +} + +static void __exit unregister_snep(void) +{ + rv_unregister_monitor(&rv_snep); +} + +module_init(register_snep); +module_exit(unregister_snep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snep: schedule does not enable preempt."); diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h new file mode 100644 index 000000000000..6d16b9ad931e --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snep automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_snep { + non_scheduling_context_snep = 0, + scheduling_contex_snep, + state_max_snep +}; + +#define INVALID_STATE state_max_snep + +enum events_snep { + preempt_disable_snep = 0, + preempt_enable_snep, + schedule_entry_snep, + schedule_exit_snep, + event_max_snep +}; + 
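[Editorial aside] Each of these generated headers encodes a deterministic automaton as a states-by-events next-state matrix, with the state_max_* enumerator doubling as INVALID_STATE. The userspace sketch below walks such a table for a made-up two-state automaton; the names and the event sequence are invented, and the handling of a violation is simplified compared to what rv/da_monitor.h actually does:

#include <stdio.h>

enum states { idle = 0, busy, state_max };
enum events { ev_start = 0, ev_stop, event_max };

#define INVALID_STATE state_max

/* Next-state matrix, same shape as the automaton_* structs above. */
static const unsigned char function[state_max][event_max] = {
	/*           ev_start        ev_stop        */
	/* idle */ { busy,           INVALID_STATE },
	/* busy */ { INVALID_STATE,  idle          },
};

static const char *state_names[] = { "idle", "busy" };
static const char *event_names[] = { "start", "stop" };

int main(void)
{
	unsigned char curr = idle;				/* .initial_state */
	enum events trace[] = { ev_start, ev_stop, ev_stop };	/* last event breaks the model */

	for (unsigned int i = 0; i < sizeof(trace) / sizeof(trace[0]); i++) {
		unsigned char next = function[curr][trace[i]];

		if (next == INVALID_STATE) {
			printf("violation: %s not allowed in state %s\n",
			       event_names[trace[i]], state_names[curr]);
			return 1;
		}
		printf("%s -> %s on %s\n",
		       state_names[curr], state_names[next], event_names[trace[i]]);
		curr = next;
	}
	return 0;
}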
+struct automaton_snep { + char *state_names[state_max_snep]; + char *event_names[event_max_snep]; + unsigned char function[state_max_snep][event_max_snep]; + unsigned char initial_state; + bool final_states[state_max_snep]; +}; + +static const struct automaton_snep automaton_snep = { + .state_names = { + "non_scheduling_context", + "scheduling_contex" + }, + .event_names = { + "preempt_disable", + "preempt_enable", + "schedule_entry", + "schedule_exit" + }, + .function = { + { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE }, + { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep }, + }, + .initial_state = non_scheduling_context_snep, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snep/snep_trace.h b/kernel/trace/rv/monitors/snep/snep_trace.h new file mode 100644 index 000000000000..01aad49a949a --- /dev/null +++ b/kernel/trace/rv/monitors/snep/snep_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNEP +DEFINE_EVENT(event_da_monitor, event_snep, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_snep, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_SNEP */ diff --git a/kernel/trace/rv/monitors/snroc/Kconfig b/kernel/trace/rv/monitors/snroc/Kconfig new file mode 100644 index 000000000000..6e4365a2fea3 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SNROC + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_ID + bool "snroc monitor" + help + Monitor to ensure sched_set_state happens only in the respective task's context. + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c new file mode 100644 index 000000000000..bb1f60d55296 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "snroc" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "snroc.h" + +static struct rv_monitor rv_snroc; +DECLARE_DA_MON_PER_TASK(snroc, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + da_handle_event_snroc(tsk, sched_set_state_snroc); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_start_event_snroc(prev, sched_switch_out_snroc); + da_handle_event_snroc(next, sched_switch_in_snroc); +} + +static int enable_snroc(void) +{ + int retval; + + retval = da_monitor_init_snroc(); + if (retval) + return retval; + + rv_attach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("snroc", sched_switch, handle_sched_switch); + + return 0; +} + +static void disable_snroc(void) +{ + rv_snroc.enabled = 0; + + rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch); + + da_monitor_destroy_snroc(); +} + +static struct rv_monitor rv_snroc = { + .name = "snroc", + .description = "set non runnable on its own context.", + .enable = enable_snroc, + .disable = disable_snroc, + .reset = da_monitor_reset_all_snroc, + .enabled = 0, +}; + +static int __init register_snroc(void) +{ + rv_register_monitor(&rv_snroc, &rv_sched); + return 0; +} + +static void __exit unregister_snroc(void) +{ + rv_unregister_monitor(&rv_snroc); +} + +module_init(register_snroc); +module_exit(unregister_snroc); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("snroc: set non runnable on its own context."); diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h new file mode 100644 index 000000000000..c3650a2b1b10 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of snroc automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_snroc { + other_context_snroc = 0, + own_context_snroc, + state_max_snroc +}; + +#define INVALID_STATE state_max_snroc + +enum events_snroc { + sched_set_state_snroc = 0, + sched_switch_in_snroc, + sched_switch_out_snroc, + event_max_snroc +}; + +struct automaton_snroc { + char *state_names[state_max_snroc]; + char *event_names[event_max_snroc]; + unsigned char function[state_max_snroc][event_max_snroc]; + unsigned char initial_state; + bool final_states[state_max_snroc]; +}; + +static const struct automaton_snroc automaton_snroc = { + .state_names = { + "other_context", + "own_context" + }, + .event_names = { + "sched_set_state", + "sched_switch_in", + "sched_switch_out" + }, + .function = { + { 
INVALID_STATE, own_context_snroc, INVALID_STATE }, + { own_context_snroc, INVALID_STATE, other_context_snroc }, + }, + .initial_state = other_context_snroc, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/snroc/snroc_trace.h b/kernel/trace/rv/monitors/snroc/snroc_trace.h new file mode 100644 index 000000000000..50114cef5122 --- /dev/null +++ b/kernel/trace/rv/monitors/snroc/snroc_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SNROC +DEFINE_EVENT(event_da_monitor_id, event_snroc, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_snroc, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SNROC */ diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/tss/Kconfig new file mode 100644 index 000000000000..479f86f52e60 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_TSS + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "tss monitor" + help + Monitor to ensure sched_switch happens only in scheduling context. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c new file mode 100644 index 000000000000..542787e6524f --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/ftrace.h> +#include <linux/tracepoint.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rv.h> +#include <rv/instrumentation.h> +#include <rv/da_monitor.h> + +#define MODULE_NAME "tss" + +#include <trace/events/sched.h> +#include <rv_trace.h> +#include <monitors/sched/sched.h> + +#include "tss.h" + +static struct rv_monitor rv_tss; +DECLARE_DA_MON_PER_CPU(tss, unsigned char); + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_event_tss(sched_switch_tss); +} + +static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +{ + da_handle_event_tss(schedule_entry_tss); +} + +static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +{ + da_handle_start_event_tss(schedule_exit_tss); +} + +static int enable_tss(void) +{ + int retval; + + retval = da_monitor_init_tss(); + if (retval) + return retval; + + rv_attach_trace_probe("tss", sched_switch, handle_sched_switch); + rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); + + return 0; +} + +static void disable_tss(void) +{ + rv_tss.enabled = 0; + + rv_detach_trace_probe("tss", sched_switch, handle_sched_switch); + rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); + + da_monitor_destroy_tss(); +} + +static struct rv_monitor rv_tss = { + .name = "tss", + .description = "task switch while scheduling.", + .enable = enable_tss, + .disable = disable_tss, + .reset = da_monitor_reset_all_tss, + .enabled = 0, +}; + +static int __init register_tss(void) +{ + 
rv_register_monitor(&rv_tss, &rv_sched); + return 0; +} + +static void __exit unregister_tss(void) +{ + rv_unregister_monitor(&rv_tss); +} + +module_init(register_tss); +module_exit(unregister_tss); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>"); +MODULE_DESCRIPTION("tss: task switch while scheduling."); diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h new file mode 100644 index 000000000000..f0a36fda1b87 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of tss automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_tss { + thread_tss = 0, + sched_tss, + state_max_tss +}; + +#define INVALID_STATE state_max_tss + +enum events_tss { + sched_switch_tss = 0, + schedule_entry_tss, + schedule_exit_tss, + event_max_tss +}; + +struct automaton_tss { + char *state_names[state_max_tss]; + char *event_names[event_max_tss]; + unsigned char function[state_max_tss][event_max_tss]; + unsigned char initial_state; + bool final_states[state_max_tss]; +}; + +static const struct automaton_tss automaton_tss = { + .state_names = { + "thread", + "sched" + }, + .event_names = { + "sched_switch", + "schedule_entry", + "schedule_exit" + }, + .function = { + { INVALID_STATE, sched_tss, INVALID_STATE }, + { sched_tss, INVALID_STATE, thread_tss }, + }, + .initial_state = thread_tss, + .final_states = { 1, 0 }, +}; diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/tss/tss_trace.h new file mode 100644 index 000000000000..4619dbb50cc0 --- /dev/null +++ b/kernel/trace/rv/monitors/tss/tss_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_TSS +DEFINE_EVENT(event_da_monitor, event_tss, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_tss, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_TSS */ diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig index 3ef664b5cd90..e464b9294865 100644 --- a/kernel/trace/rv/monitors/wip/Kconfig +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +# config RV_MON_WIP depends on RV depends on PREEMPT_TRACER diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index db7389157c87..ed758fec8608 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -71,7 +71,7 @@ static struct rv_monitor rv_wip = { static int __init register_wip(void) { - rv_register_monitor(&rv_wip); + rv_register_monitor(&rv_wip, NULL); return 0; } diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h index 2e373f2c65ed..c7193748bf36 100644 --- a/kernel/trace/rv/monitors/wip/wip.h +++ b/kernel/trace/rv/monitors/wip/wip.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wip automaton * For further information about this format, see kernel documentation: diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig index ee741aa6d6b8..d3bfc20037db 100644 --- a/kernel/trace/rv/monitors/wwnr/Kconfig +++ 
b/kernel/trace/rv/monitors/wwnr/Kconfig @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +# config RV_MON_WWNR depends on RV select DA_MON_EVENTS_ID diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 3b16994a9984..172f31c4b0f3 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -70,7 +70,7 @@ static struct rv_monitor rv_wwnr = { static int __init register_wwnr(void) { - rv_register_monitor(&rv_wwnr); + rv_register_monitor(&rv_wwnr, NULL); return 0; } diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h index d0d9c4b8121b..0a59d23edf61 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.h +++ b/kernel/trace/rv/monitors/wwnr/wwnr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Automatically generated C representation of wwnr automaton * For further information about this format, see kernel documentation: diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 8657fc8806e7..50344aa9f7f9 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -162,7 +162,7 @@ struct dentry *get_monitors_root(void) /* * Interface for the monitor register. */ -static LIST_HEAD(rv_monitors_list); +LIST_HEAD(rv_monitors_list); static int task_monitor_count; static bool task_monitor_slots[RV_PER_TASK_MONITORS]; @@ -207,6 +207,30 @@ void rv_put_task_monitor_slot(int slot) } /* + * Monitors with a parent are nested, + * Monitors without a parent could be standalone or containers. + */ +bool rv_is_nested_monitor(struct rv_monitor_def *mdef) +{ + return mdef->parent != NULL; +} + +/* + * We set our list to have nested monitors listed after their parent + * if a monitor has a child element its a container. + * Containers can be also identified based on their function pointers: + * as they are not real monitors they do not need function definitions + * for enable()/disable(). Use this condition to find empty containers. + * Keep both conditions in case we have some non-compliant containers. + */ +bool rv_is_container_monitor(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *next = list_next_entry(mdef, list); + + return next->parent == mdef->monitor || !mdef->monitor->enable; +} + +/* * This section collects the monitor/ files and folders. */ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, @@ -229,7 +253,8 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) if (mdef->monitor->enabled) { mdef->monitor->enabled = 0; - mdef->monitor->disable(); + if (mdef->monitor->disable) + mdef->monitor->disable(); /* * Wait for the execution of all events to finish. 
@@ -243,6 +268,60 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) return 0; } +static void rv_disable_single(struct rv_monitor_def *mdef) +{ + __rv_disable_monitor(mdef, true); +} + +static int rv_enable_single(struct rv_monitor_def *mdef) +{ + int retval; + + lockdep_assert_held(&rv_interface_lock); + + if (mdef->monitor->enabled) + return 0; + + retval = mdef->monitor->enable(); + + if (!retval) + mdef->monitor->enabled = 1; + + return retval; +} + +static void rv_disable_container(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *p = mdef; + int enabled = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mdef->monitor) + break; + enabled += __rv_disable_monitor(p, false); + } + if (enabled) + tracepoint_synchronize_unregister(); + mdef->monitor->enabled = 0; +} + +static int rv_enable_container(struct rv_monitor_def *mdef) +{ + struct rv_monitor_def *p = mdef; + int retval = 0; + + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (retval || p->parent != mdef->monitor) + break; + retval = rv_enable_single(p); + } + if (retval) + rv_disable_container(mdef); + else + mdef->monitor->enabled = 1; + return retval; +} + /** * rv_disable_monitor - disable a given runtime monitor * @mdef: Pointer to the monitor definition structure. @@ -251,7 +330,11 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) */ int rv_disable_monitor(struct rv_monitor_def *mdef) { - __rv_disable_monitor(mdef, true); + if (rv_is_container_monitor(mdef)) + rv_disable_container(mdef); + else + rv_disable_single(mdef); + return 0; } @@ -265,15 +348,10 @@ int rv_enable_monitor(struct rv_monitor_def *mdef) { int retval; - lockdep_assert_held(&rv_interface_lock); - - if (mdef->monitor->enabled) - return 0; - - retval = mdef->monitor->enable(); - - if (!retval) - mdef->monitor->enabled = 1; + if (rv_is_container_monitor(mdef)) + retval = rv_enable_container(mdef); + else + retval = rv_enable_single(mdef); return retval; } @@ -336,9 +414,9 @@ static const struct file_operations interface_desc_fops = { * the monitor dir, where the specific options of the monitor * are exposed. */ -static int create_monitor_dir(struct rv_monitor_def *mdef) +static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent) { - struct dentry *root = get_monitors_root(); + struct dentry *root = parent ? 
parent->root_d : get_monitors_root(); const char *name = mdef->monitor->name; struct dentry *tmp; int retval; @@ -377,7 +455,11 @@ static int monitors_show(struct seq_file *m, void *p) { struct rv_monitor_def *mon_def = p; - seq_printf(m, "%s\n", mon_def->monitor->name); + if (mon_def->parent) + seq_printf(m, "%s:%s\n", mon_def->parent->name, + mon_def->monitor->name); + else + seq_printf(m, "%s\n", mon_def->monitor->name); return 0; } @@ -514,7 +596,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user struct rv_monitor_def *mdef; int retval = -EINVAL; bool enable = true; - char *ptr; + char *ptr, *tmp; int len; if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) @@ -541,6 +623,11 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user retval = -EINVAL; + /* we support 1 nesting level, trim the parent */ + tmp = strstr(ptr, ":"); + if (tmp) + ptr = tmp+1; + list_for_each_entry(mdef, &rv_monitors_list, list) { if (strcmp(ptr, mdef->monitor->name) != 0) continue; @@ -613,7 +700,7 @@ static void reset_all_monitors(void) struct rv_monitor_def *mdef; list_for_each_entry(mdef, &rv_monitors_list, list) { - if (mdef->monitor->enabled) + if (mdef->monitor->enabled && mdef->monitor->reset) mdef->monitor->reset(); } } @@ -685,18 +772,19 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef) /** * rv_register_monitor - register a rv monitor. * @monitor: The rv_monitor to be registered. + * @parent: The parent of the monitor to be registered, NULL if not nested. * * Returns 0 if successful, error otherwise. */ -int rv_register_monitor(struct rv_monitor *monitor) +int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) { - struct rv_monitor_def *r; + struct rv_monitor_def *r, *p = NULL; int retval = 0; if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { pr_info("Monitor %s has a name longer than %d\n", monitor->name, MAX_RV_MONITOR_NAME_SIZE); - return -1; + return -EINVAL; } mutex_lock(&rv_interface_lock); @@ -704,11 +792,26 @@ int rv_register_monitor(struct rv_monitor *monitor) list_for_each_entry(r, &rv_monitors_list, list) { if (strcmp(monitor->name, r->monitor->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); - retval = -1; + retval = -EEXIST; goto out_unlock; } } + if (parent) { + list_for_each_entry(r, &rv_monitors_list, list) { + if (strcmp(parent->name, r->monitor->name) == 0) { + p = r; + break; + } + } + } + + if (p && rv_is_nested_monitor(p)) { + pr_info("Parent monitor %s is already nested, cannot nest further\n", + parent->name); + return -EINVAL; + } + r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); if (!r) { retval = -ENOMEM; @@ -716,14 +819,19 @@ int rv_register_monitor(struct rv_monitor *monitor) } r->monitor = monitor; + r->parent = parent; - retval = create_monitor_dir(r); + retval = create_monitor_dir(r, p); if (retval) { kfree(r); goto out_unlock; } - list_add_tail(&r->list, &rv_monitors_list); + /* keep children close to the parent for easier visualisation */ + if (p) + list_add(&r->list, &p->list); + else + list_add_tail(&r->list, &rv_monitors_list); out_unlock: mutex_unlock(&rv_interface_lock); diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index db6cb0913dbd..98fca0a1adbc 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -21,6 +21,7 @@ struct rv_interface { #define MAX_RV_REACTOR_NAME_SIZE 32 extern struct mutex rv_interface_lock; +extern struct list_head rv_monitors_list; #ifdef CONFIG_RV_REACTORS struct rv_reactor_def 
{ @@ -34,6 +35,7 @@ struct rv_reactor_def { struct rv_monitor_def { struct list_head list; struct rv_monitor *monitor; + struct rv_monitor *parent; struct dentry *root_d; #ifdef CONFIG_RV_REACTORS struct rv_reactor_def *rdef; @@ -45,6 +47,8 @@ struct rv_monitor_def { struct dentry *get_monitors_root(void); int rv_disable_monitor(struct rv_monitor_def *mdef); int rv_enable_monitor(struct rv_monitor_def *mdef); +bool rv_is_container_monitor(struct rv_monitor_def *mdef); +bool rv_is_nested_monitor(struct rv_monitor_def *mdef); #ifdef CONFIG_RV_REACTORS int reactor_populate_monitor(struct rv_monitor_def *mdef); diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 7b49cbe388d4..9501ca886d83 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -158,8 +158,9 @@ static const struct seq_operations monitor_reactors_seq_ops = { .show = monitor_reactor_show }; -static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef, - bool reacting) +static void monitor_swap_reactors_single(struct rv_monitor_def *mdef, + struct rv_reactor_def *rdef, + bool reacting, bool nested) { bool monitor_enabled; @@ -179,10 +180,31 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor mdef->reacting = reacting; mdef->monitor->react = rdef->reactor->react; - if (monitor_enabled) + /* enable only once if iterating through a container */ + if (monitor_enabled && !nested) rv_enable_monitor(mdef); } +static void monitor_swap_reactors(struct rv_monitor_def *mdef, + struct rv_reactor_def *rdef, bool reacting) +{ + struct rv_monitor_def *p = mdef; + + if (rv_is_container_monitor(mdef)) + list_for_each_entry_continue(p, &rv_monitors_list, list) { + if (p->parent != mdef->monitor) + break; + monitor_swap_reactors_single(p, rdef, reacting, true); + } + /* + * This call enables and disables the monitor if they were active. + * In case of a container, we already disabled all and will enable all. + * All nested monitors are enabled also if they were off, we may refine + * this logic in the future. 
+ */ + monitor_swap_reactors_single(mdef, rdef, reacting, false); +} + static ssize_t monitor_reactors_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 5e65097423ba..422b75f58891 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -58,6 +58,11 @@ DECLARE_EVENT_CLASS(error_da_monitor, ); #include <monitors/wip/wip_trace.h> +#include <monitors/tss/tss_trace.h> +#include <monitors/sco/sco_trace.h> +#include <monitors/scpd/scpd_trace.h> +#include <monitors/snep/snep_trace.h> +#include <monitors/sncid/sncid_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ @@ -118,6 +123,7 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, ); #include <monitors/wwnr/wwnr_trace.h> +#include <monitors/snroc/snroc_trace.h> // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1496a5ac33ae..826267f5b650 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -87,6 +87,7 @@ void __init disable_tracing_selftest(const char *reason) static struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; static bool tracepoint_printk_stop_on_boot __initdata; +static bool traceoff_after_boot __initdata; static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ @@ -330,6 +331,13 @@ static int __init set_tracepoint_printk_stop(char *str) } __setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop); +static int __init set_traceoff_after_boot(char *str) +{ + traceoff_after_boot = true; + return 1; +} +__setup("traceoff_after_boot", set_traceoff_after_boot); + unsigned long long ns2usecs(u64 nsec) { nsec += 500; @@ -2878,13 +2886,16 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, void trace_function(struct trace_array *tr, unsigned long ip, unsigned long - parent_ip, unsigned int trace_ctx) + parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs) { struct trace_buffer *buffer = tr->array_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; + int size = sizeof(*entry); + + size += FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long); - event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size, trace_ctx); if (!event) return; @@ -2892,6 +2903,13 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long entry->ip = ip; entry->parent_ip = parent_ip; +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + if (static_branch_unlikely(&trace_function_exports_enabled)) ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); @@ -4100,12 +4118,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, - preempt_model_none() ? "server" : - preempt_model_voluntary() ? "desktop" : - preempt_model_full() ? "preempt" : - preempt_model_lazy() ? "lazy" : - preempt_model_rt() ? 
"preempt_rt" : - "unknown", + preempt_model_str(), /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP @@ -5977,8 +5990,6 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { - int ret; - guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { @@ -5987,11 +5998,7 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr, return -EINVAL; } - ret = __tracing_resize_ring_buffer(tr, size, cpu_id); - if (ret < 0) - ret = -ENOMEM; - - return ret; + return __tracing_resize_ring_buffer(tr, size, cpu_id); } static void update_last_data(struct trace_array *tr) @@ -8285,6 +8292,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; + /* Currently the boot mapped buffer is not supported for mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) + return -ENODEV; + ret = get_snapshot_map(iter->tr); if (ret) return ret; @@ -10706,6 +10717,9 @@ __init static int late_trace_init(void) tracepoint_printk = 0; } + if (traceoff_after_boot) + tracing_off(); + tracing_set_default_clock(); clear_boot_tracer(); return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9c21ba45b7af..4a6621e2a0fa 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -21,6 +21,7 @@ #include <linux/workqueue.h> #include <linux/ctype.h> #include <linux/once_lite.h> +#include <linux/ftrace_regs.h> #include "pid_list.h" @@ -697,7 +698,8 @@ unsigned long trace_total_entries(struct trace_array *tr); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, - unsigned int trace_ctx); + unsigned int trace_ctx, + struct ftrace_regs *regs); void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -897,6 +899,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_RETVAL 0x800 #define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000 #define TRACE_GRAPH_PRINT_RETADDR 0x2000 +#define TRACE_GRAPH_ARGS 0x4000 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -1714,7 +1717,7 @@ struct event_trigger_data { unsigned long count; int ref; int flags; - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; @@ -1959,7 +1962,7 @@ struct event_command { int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); - struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); + const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; /** diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index fbfb396905a6..ee40d4e6ad1c 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -61,8 +61,9 @@ FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, F_STRUCT( - __field_fn( unsigned long, ip ) - __field_fn( unsigned long, parent_ip ) + __field_fn( unsigned long, ip ) + __field_fn( unsigned long, parent_ip ) + __dynamic_array( unsigned long, args ) ), F_printk(" %ps <-- %ps", @@ -72,17 +73,18 @@ FTRACE_ENTRY_REG(function, ftrace_entry, ); /* Function call entry */ -FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, TRACE_GRAPH_ENT, F_STRUCT( __field_struct( struct 
ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( int, graph_ent, depth ) + __field_packed( unsigned long, graph_ent, depth ) + __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 82fd637cfc19..c08355c3ef32 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -478,7 +478,7 @@ static void eprobe_trigger_func(struct event_trigger_data *data, __eprobe_trace_func(edata, rec); } -static struct event_trigger_ops eprobe_trigger_ops = { +static const struct event_trigger_ops eprobe_trigger_ops = { .trigger = eprobe_trigger_func, .print = eprobe_trigger_print, .init = eprobe_trigger_init, @@ -507,8 +507,8 @@ static void eprobe_trigger_unreg_func(char *glob, } -static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, - char *param) +static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd, + char *param) { return &eprobe_trigger_ops; } @@ -913,6 +913,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (argc - 2 > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); ret = -E2BIG; goto error; } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 3ff9caa4a71b..a6bb7577e8c5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event)) { - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; @@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. */ - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4cb275316e51..8e7603acca21 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -790,7 +790,9 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags); } - call->class->reg(call, TRACE_REG_UNREGISTER, file); + ret = call->class->reg(call, TRACE_REG_UNREGISTER, file); + + WARN_ON_ONCE(ret); } /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ if (file->flags & EVENT_FILE_FL_SOFT_MODE) @@ -1591,6 +1593,13 @@ s_next(struct seq_file *m, void *v, loff_t *pos) return iter; #endif + /* + * The iter is allocated in s_start() and passed via the 'v' + * parameter. To stop the iterator, NULL must be returned. But + * the return value is what the 'v' parameter in s_stop() receives + * and frees. Free iter here as it will no longer be used. 
+ */ + kfree(iter); return NULL; } @@ -1667,9 +1676,9 @@ static int s_show(struct seq_file *m, void *v) } #endif -static void s_stop(struct seq_file *m, void *p) +static void s_stop(struct seq_file *m, void *v) { - kfree(p); + kfree(v); t_stop(m, NULL); } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 261163b00137..1260c23cfa5f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5689,12 +5689,16 @@ static int event_hist_open(struct inode *inode, struct file *file) guard(mutex)(&event_mutex); event_file = event_file_data(file); - if (!event_file) - return -ENODEV; + if (!event_file) { + ret = -ENODEV; + goto err; + } hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL); - if (!hist_file) - return -ENOMEM; + if (!hist_file) { + ret = -ENOMEM; + goto err; + } hist_file->file = file; hist_file->last_act = get_hist_hit_count(event_file); @@ -5702,9 +5706,14 @@ static int event_hist_open(struct inode *inode, struct file *file) /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; ret = single_open(file, hist_show, hist_file); - if (ret) + if (ret) { kfree(hist_file); + goto err; + } + return 0; +err: + tracing_release_file_tr(inode, file); return ret; } @@ -5979,7 +5988,10 @@ static int event_hist_debug_open(struct inode *inode, struct file *file) /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; - return single_open(file, hist_debug_show, file); + ret = single_open(file, hist_debug_show, file); + if (ret) + tracing_release_file_tr(inode, file); + return ret; } const struct file_operations event_hist_debug_fops = { @@ -6191,7 +6203,7 @@ static void event_hist_trigger_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_hist_trigger_ops = { +static const struct event_trigger_ops event_hist_trigger_ops = { .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_init, @@ -6223,15 +6235,15 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_hist_trigger_named_ops = { +static const struct event_trigger_ops event_hist_trigger_named_ops = { .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_named_init, .free = event_hist_trigger_named_free, }; -static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, - char *param) +static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd, + char *param) { return &event_hist_trigger_ops; } @@ -6724,27 +6736,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - ret = event_trigger_register(cmd_ops, file, glob, trigger_data); - if (ret < 0) - goto out_free; + if (!get_named_trigger_data(trigger_data)) { - if (get_named_trigger_data(trigger_data)) - goto enable; + ret = create_actions(hist_data); + if (ret) + goto out_free; - ret = create_actions(hist_data); - if (ret) - goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_free; + } - if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - ret = save_hist_vars(hist_data); + ret = tracing_map_init(hist_data->map); if (ret) - goto out_unreg; + goto out_free; } - ret = tracing_map_init(hist_data->map); - if (ret) - goto out_unreg; -enable: + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + 
if (ret < 0) + goto out_free; + ret = hist_trigger_enable(trigger_data, file); if (ret) goto out_unreg; @@ -6826,38 +6838,38 @@ hist_enable_count_trigger(struct event_trigger_data *data, hist_enable_trigger(data, buffer, rec, event); } -static struct event_trigger_ops hist_enable_trigger_ops = { +static const struct event_trigger_ops hist_enable_trigger_ops = { .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_enable_count_trigger_ops = { +static const struct event_trigger_ops hist_enable_count_trigger_ops = { .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_disable_trigger_ops = { +static const struct event_trigger_ops hist_disable_trigger_ops = { .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops hist_disable_count_trigger_ops = { +static const struct event_trigger_ops hist_disable_count_trigger_ops = { .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * hist_enable_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; bool enable; enable = (strcmp(cmd, ENABLE_HIST_STR) == 0); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index e3f7d09e5512..969f48742d72 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -207,7 +207,7 @@ static int synth_field_string_size(char *type) if (len == 0) return 0; /* variable-length string */ - strncpy(buf, start, len); + memcpy(buf, start, len); buf[len] = '\0'; err = kstrtouint(buf, 0, &size); @@ -305,7 +305,7 @@ static const char *synth_field_fmt(char *type) else if (strcmp(type, "gfp_t") == 0) fmt = "%x"; else if (synth_field_is_string(type)) - fmt = "%.*s"; + fmt = "%s"; else if (synth_field_is_stack(type)) fmt = "%s"; @@ -612,7 +612,7 @@ static int __set_synth_event_print_fmt(struct synth_event *event, fmt = synth_field_fmt(event->fields[i]->type); pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", event->fields[i]->name, fmt, - i == event->n_fields - 1 ? "" : ", "); + i == event->n_fields - 1 ? 
"" : " "); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); @@ -852,6 +852,38 @@ static struct trace_event_fields synth_event_fields_array[] = { {} }; +static int synth_event_reg(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + struct synth_event *event = container_of(call, struct synth_event, call); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: +#endif + case TRACE_REG_REGISTER: + if (!try_module_get(event->mod)) + return -EBUSY; + break; + default: + break; + } + + int ret = trace_event_reg(call, type, data); + + switch (type) { +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_UNREGISTER: +#endif + case TRACE_REG_UNREGISTER: + module_put(event->mod); + break; + default: + break; + } + return ret; +} + static int register_synth_event(struct synth_event *event) { struct trace_event_call *call = &event->call; @@ -881,7 +913,7 @@ static int register_synth_event(struct synth_event *event) goto out; } call->flags = TRACE_EVENT_FL_TRACEPOINT; - call->class->reg = trace_event_reg; + call->class->reg = synth_event_reg; call->class->probe = trace_event_raw_event_synth; call->data = event; call->tp = event->tp; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d45448947094..b66b6d235d91 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -825,7 +825,7 @@ struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, void *private_data) { struct event_trigger_data *trigger_data; - struct event_trigger_ops *trigger_ops; + const struct event_trigger_ops *trigger_ops; trigger_ops = cmd_ops->get_trigger_ops(cmd, param); @@ -1367,38 +1367,38 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops traceon_trigger_ops = { +static const struct event_trigger_ops traceon_trigger_ops = { .trigger = traceon_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceon_count_trigger_ops = { +static const struct event_trigger_ops traceon_count_trigger_ops = { .trigger = traceon_count_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceoff_trigger_ops = { +static const struct event_trigger_ops traceoff_trigger_ops = { .trigger = traceoff_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops traceoff_count_trigger_ops = { +static const struct event_trigger_ops traceoff_count_trigger_ops = { .trigger = traceoff_count_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * onoff_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) @@ -1491,21 +1491,21 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops snapshot_trigger_ops = { +static const struct event_trigger_ops snapshot_trigger_ops = { .trigger = snapshot_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops snapshot_count_trigger_ops = { +static const struct 
event_trigger_ops snapshot_count_trigger_ops = { .trigger = snapshot_count_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * snapshot_get_trigger_ops(char *cmd, char *param) { return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; @@ -1586,21 +1586,21 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data) data->filter_str); } -static struct event_trigger_ops stacktrace_trigger_ops = { +static const struct event_trigger_ops stacktrace_trigger_ops = { .trigger = stacktrace_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops stacktrace_count_trigger_ops = { +static const struct event_trigger_ops stacktrace_count_trigger_ops = { .trigger = stacktrace_count_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; -static struct event_trigger_ops * +static const struct event_trigger_ops * stacktrace_get_trigger_ops(char *cmd, char *param) { return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; @@ -1711,28 +1711,28 @@ void event_enable_trigger_free(struct event_trigger_data *data) } } -static struct event_trigger_ops event_enable_trigger_ops = { +static const struct event_trigger_ops event_enable_trigger_ops = { .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_enable_count_trigger_ops = { +static const struct event_trigger_ops event_enable_count_trigger_ops = { .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_disable_trigger_ops = { +static const struct event_trigger_ops event_disable_trigger_ops = { .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; -static struct event_trigger_ops event_disable_count_trigger_ops = { +static const struct event_trigger_ops event_disable_count_trigger_ops = { .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, @@ -1916,10 +1916,10 @@ void event_enable_unregister_trigger(char *glob, data->ops->free(data); } -static struct event_trigger_ops * +static const struct event_trigger_ops * event_enable_get_trigger_ops(char *cmd, char *param) { - struct event_trigger_ops *ops; + const struct event_trigger_ops *ops; bool enable; #ifdef CONFIG_HIST_TRIGGERS diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 97325fbd6283..af42aaa3d172 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -455,7 +455,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work) if (ret && ret != -ENOENT) { struct user_event *user = enabler->event; - pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n", + pr_warn("user_events: Fault for mm: 0x%p @ 0x%llx event: %s\n", mm->mm, (unsigned long long)uaddr, EVENT_NAME(user)); } @@ -2793,11 +2793,8 @@ static int user_seq_show(struct seq_file *m, void *p) seq_printf(m, "%s", EVENT_TP_NAME(user)); - if (status != 0) - seq_puts(m, " #"); - if (status != 0) { - seq_puts(m, " Used by"); + seq_puts(m, " # Used by"); if (status & EVENT_STATUS_FTRACE) seq_puts(m, " 
ftrace"); if (status & EVENT_STATUS_PERF) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index b8f3c4ba309b..5d7ca80173ea 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -920,13 +920,8 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { data->tpoint = tp; - if (!data->mod) { + if (!data->mod) data->mod = mod; - if (!try_module_get(data->mod)) { - data->tpoint = NULL; - data->mod = NULL; - } - } } } @@ -938,13 +933,7 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) data->tpoint = tp; } -/* - * Find a tracepoint from kernel and module. If the tracepoint is in a module, - * this increments the module refcount to prevent unloading until the - * trace_fprobe is registered to the list. After registering the trace_fprobe - * on the trace_fprobe list, the module refcount is decremented because - * tracepoint_probe_module_cb will handle it. - */ +/* Find a tracepoint from kernel and module. */ static struct tracepoint *find_tracepoint(const char *tp_name, struct module **tp_mod) { @@ -973,6 +962,7 @@ static void reenable_trace_fprobe(struct trace_fprobe *tf) } } +/* Find a tracepoint from specified module. */ static struct tracepoint *find_tracepoint_in_module(struct module *mod, const char *tp_name) { @@ -1008,10 +998,13 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, reenable_trace_fprobe(tf); } } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { - tracepoint_probe_unregister(tf->tpoint, + unregister_fprobe(&tf->fp); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; + tf->tpoint = TRACEPOINT_STUB; + tf->mod = NULL; + } } } mutex_unlock(&event_mutex); @@ -1049,6 +1042,19 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (*is_return) return 0; + if (is_tracepoint) { + tmp = *symbol; + while (*tmp && (isalnum(*tmp) || *tmp == '_')) + tmp++; + if (*tmp) { + /* find a wrong character. */ + trace_probe_log_err(tmp - *symbol, BAD_TP_NAME); + kfree(*symbol); + *symbol = NULL; + return -EINVAL; + } + } + /* If there is $retval, this should be a return fprobe. */ for (i = 2; i < argc; i++) { tmp = strstr(argv[i], "$retval"); @@ -1056,6 +1062,8 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (is_tracepoint) { trace_probe_log_set_index(i); trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE); + kfree(*symbol); + *symbol = NULL; return -EINVAL; } *is_return = true; @@ -1161,6 +1169,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (is_tracepoint) { ctx->flags |= TPARG_FL_TPOINT; tpoint = find_tracepoint(symbol, &tp_mod); + /* lock module until register this tprobe. 
*/ + if (tp_mod && !try_module_get(tp_mod)) { + tpoint = NULL; + tp_mod = NULL; + } if (tpoint) { ctx->funcname = kallsyms_lookup( (unsigned long)tpoint->probestub, @@ -1186,8 +1199,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) + if (argc > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) @@ -1215,6 +1231,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (is_return && tf->tp.entry_arg) { tf->fp.entry_handler = trace_fprobe_entry_handler; tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp); + if (ALIGN(tf->fp.entry_data_size, sizeof(long)) > MAX_FPROBE_DATA_SIZE) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_EARGS); + return -E2BIG; + } } ret = traceprobe_set_print_fmt(&tf->tp, diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d358c9935164..98ccf3f00c51 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -25,6 +25,9 @@ static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); static void @@ -42,9 +45,10 @@ enum { TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ TRACE_FUNC_OPT_STACK = 0x1, TRACE_FUNC_OPT_NO_REPEATS = 0x2, + TRACE_FUNC_OPT_ARGS = 0x4, /* Update this to next highest bit. 
*/ - TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 + TRACE_FUNC_OPT_HIGHEST_BIT = 0x8 }; #define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) @@ -114,6 +118,8 @@ static ftrace_func_t select_trace_function(u32 flags_val) switch (flags_val & TRACE_FUNC_OPT_MASK) { case TRACE_FUNC_NO_OPTS: return function_trace_call; + case TRACE_FUNC_OPT_ARGS: + return function_args_trace_call; case TRACE_FUNC_OPT_STACK: return function_stack_trace_call; case TRACE_FUNC_OPT_NO_REPEATS: @@ -216,11 +222,38 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); - trace_ctx = tracing_gen_ctx(); + trace_ctx = tracing_gen_ctx_dec(); data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); + + ftrace_test_recursion_unlock(bit); +} + +static void +function_args_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned int trace_ctx; + int bit; + int cpu; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + trace_ctx = tracing_gen_ctx(); + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + if (!atomic_read(&data->disabled)) + trace_function(tr, ip, parent_ip, trace_ctx, fregs); ftrace_test_recursion_unlock(bit); } @@ -270,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); #ifdef CONFIG_UNWINDER_FRAME_POINTER if (ftrace_pids_enabled(op)) skip++; @@ -321,7 +354,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned int trace_ctx; - unsigned long flags; int bit; if (unlikely(!tr->function_enabled)) @@ -347,11 +379,10 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; - local_save_flags(flags); - trace_ctx = tracing_gen_ctx_flags(flags); + trace_ctx = tracing_gen_ctx_dec(); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); out: ftrace_test_recursion_unlock(bit); @@ -391,7 +422,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); __trace_stack(tr, trace_ctx, STACK_SKIP); } @@ -405,6 +436,9 @@ static struct tracer_opt func_opts[] = { { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, +#ifdef CONFIG_FUNCTION_TRACE_ARGS + { TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) }, +#endif { } /* Always set a last empty entry */ }; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 54d850997c0a..2f077d4158e5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -71,6 +71,10 @@ static struct tracer_opt trace_opts[] = { /* Display 
function return address ? */ { TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) }, #endif +#ifdef CONFIG_FUNCTION_TRACE_ARGS + /* Display function arguments ? */ + { TRACER_OPT(funcgraph-args, TRACE_GRAPH_ARGS) }, +#endif /* Include sleep time (scheduled out) between entry and return */ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, @@ -110,25 +114,43 @@ static void print_graph_duration(struct trace_array *tr, unsigned long long duration, struct trace_seq *s, u32 flags); -int __trace_graph_entry(struct trace_array *tr, - struct ftrace_graph_ent *trace, - unsigned int trace_ctx) +static int __graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, + unsigned int trace_ctx, struct ftrace_regs *fregs) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; struct ftrace_graph_ent_entry *entry; + int size; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, - sizeof(*entry), trace_ctx); + /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long size words */ + size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long)); + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, size, trace_ctx); if (!event) return 0; - entry = ring_buffer_event_data(event); - entry->graph_ent = *trace; + + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + +#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API + if (fregs) { + for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++) + entry->args[i] = ftrace_regs_get_argument(fregs, i); + } +#endif + trace_buffer_unlock_commit_nostack(buffer, event); return 1; } +int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned int trace_ctx) +{ + return __graph_entry(tr, trace, trace_ctx, NULL); +} + #ifdef CONFIG_FUNCTION_GRAPH_RETADDR int __trace_graph_retaddr_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, @@ -174,9 +196,9 @@ struct fgraph_times { unsigned long long sleeptime; /* may be optional! */ }; -int trace_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops, - struct ftrace_regs *fregs) +static int graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; @@ -198,7 +220,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, * returning from the function. */ if (ftrace_graph_notrace_addr(trace->func)) { - *task_var |= TRACE_GRAPH_NOTRACE_BIT; + *task_var |= TRACE_GRAPH_NOTRACE; /* * Need to return 1 to have the return called * that will clear the NOTRACE bit. 
@@ -246,7 +268,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, unsigned long retaddr = ftrace_graph_top_ret_addr(current); ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); } else { - ret = __trace_graph_entry(tr, trace, trace_ctx); + ret = __graph_entry(tr, trace, trace_ctx, fregs); } } preempt_enable_notrace(); @@ -254,6 +276,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, return ret; } +int trace_graph_entry(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, NULL); +} + +static int trace_graph_entry_args(struct ftrace_graph_ent *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + return graph_entry(trace, gops, fregs); +} + static void __trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned int trace_ctx) @@ -418,14 +454,17 @@ static int graph_trace_init(struct trace_array *tr) { int ret; - tr->gops->entryfunc = trace_graph_entry; + if (tracer_flags_is_set(TRACE_GRAPH_ARGS)) + tr->gops->entryfunc = trace_graph_entry_args; + else + tr->gops->entryfunc = trace_graph_entry; if (tracing_thresh) tr->gops->retfunc = trace_graph_thresh_return; else tr->gops->retfunc = trace_graph_return; - /* Make gops functions are visible before we start tracing */ + /* Make gops functions visible before we start tracing */ smp_mb(); ret = register_ftrace_graph(tr->gops); @@ -436,6 +475,28 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static int ftrace_graph_trace_args(struct trace_array *tr, int set) +{ + trace_func_graph_ent_t entry; + + if (set) + entry = trace_graph_entry_args; + else + entry = trace_graph_entry; + + /* See if there's any changes */ + if (tr->gops->entryfunc == entry) + return 0; + + unregister_ftrace_graph(tr->gops); + + tr->gops->entryfunc = entry; + + /* Make gops functions visible before we start tracing */ + smp_mb(); + return register_ftrace_graph(tr->gops); +} + static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); @@ -775,7 +836,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry, struct ftrace_graph_ret *graph_ret, void *func, - u32 opt_flags, u32 trace_flags) + u32 opt_flags, u32 trace_flags, int args_size) { unsigned long err_code = 0; unsigned long retval = 0; @@ -809,7 +870,14 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT) print_retaddr = false; - trace_seq_printf(s, "%ps();", func); + trace_seq_printf(s, "%ps", func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, entry->args, (unsigned long)func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); + if (print_retval || print_retaddr) trace_seq_puts(s, " /*"); else @@ -836,7 +904,8 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr #else -#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0) +#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags, args_size) \ + do {} while (0) #endif @@ -852,16 +921,17 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; - unsigned long func; + unsigned long ret_func; + int args_size; int cpu = iter->cpu; int i; + args_size = iter->ent_size - 
offsetof(struct ftrace_graph_ent_entry, args); + graph_ret = &ret_entry->ret; call = &entry->graph_ent; duration = ret_entry->rettime - ret_entry->calltime; - func = call->func + iter->tr->text_delta; - if (data) { struct fgraph_cpu_data *cpu_data; @@ -887,16 +957,25 @@ print_graph_entry_leaf(struct trace_iterator *iter, for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) trace_seq_putc(s, ' '); + ret_func = graph_ret->func + iter->tr->text_delta; + /* * Write out the function return value or return address */ if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) { print_graph_retval(s, entry, graph_ret, (void *)graph_ret->func + iter->tr->text_delta, - flags, tr->trace_flags); + flags, tr->trace_flags, args_size); } else { - trace_seq_printf(s, "%ps();\n", (void *)func); + trace_seq_printf(s, "%ps", (void *)ret_func); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) { + print_function_args(s, entry->args, ret_func); + trace_seq_putc(s, ';'); + } else + trace_seq_puts(s, "();"); } + trace_seq_printf(s, "\n"); print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET, cpu, iter->ent->pid, flags); @@ -913,6 +992,7 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; unsigned long func; + int args_size; int i; if (data) { @@ -937,7 +1017,17 @@ print_graph_entry_nested(struct trace_iterator *iter, func = call->func + iter->tr->text_delta; - trace_seq_printf(s, "%ps() {", (void *)func); + trace_seq_printf(s, "%ps", (void *)func); + + args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args); + + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + print_function_args(s, entry->args, func); + else + trace_seq_puts(s, "()"); + + trace_seq_puts(s, " {"); + if (flags & __TRACE_GRAPH_PRINT_RETADDR && entry->ent.type == TRACE_GRAPH_RETADDR_ENT) print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry, @@ -1107,21 +1197,38 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) { struct fgraph_data *data = iter->private; - struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ent *call; struct ftrace_graph_ret_entry *leaf_ret; static enum print_line_t ret; int cpu = iter->cpu; + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. 
+ */ + struct ftrace_graph_ent_entry *entry; + u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)]; + + /* The ent_size is expected to be as big as the entry */ + if (iter->ent_size > sizeof(save_buf)) + iter->ent_size = sizeof(save_buf); + + entry = (void *)save_buf; + memcpy(entry, field, iter->ent_size); + + call = &entry->graph_ent; if (check_irq_entry(iter, flags, call->func, call->depth)) return TRACE_TYPE_HANDLED; print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); - leaf_ret = get_return_for_leaf(iter, field); + leaf_ret = get_return_for_leaf(iter, entry); if (leaf_ret) - ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); + ret = print_graph_entry_leaf(iter, entry, leaf_ret, s, flags); else - ret = print_graph_entry_nested(iter, field, s, cpu, flags); + ret = print_graph_entry_nested(iter, entry, s, cpu, flags); if (data) { /* @@ -1195,7 +1302,8 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, * funcgraph-retval option is enabled. */ if (flags & __TRACE_GRAPH_PRINT_RETVAL) { - print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags); + print_graph_retval(s, NULL, trace, (void *)func, flags, + tr->trace_flags, 0); } else { /* * If the return function does not have a matching entry, @@ -1323,16 +1431,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) switch (entry->type) { case TRACE_GRAPH_ENT: { - /* - * print_graph_entry() may consume the current event, - * thus @field may become invalid, so we need to save it. - * sizeof(struct ftrace_graph_ent_entry) is very small, - * it can be safely saved at the stack. - */ - struct ftrace_graph_ent_entry saved; trace_assign_type(field, entry); - saved = *field; - return print_graph_entry(&saved, s, iter, flags); + return print_graph_entry(field, s, iter, flags); } #ifdef CONFIG_FUNCTION_GRAPH_RETADDR case TRACE_GRAPH_RETADDR_ENT: { @@ -1511,6 +1611,7 @@ void graph_trace_close(struct trace_iterator *iter) if (data) { free_percpu(data->cpu_data); kfree(data); + iter->private = NULL; } } @@ -1526,6 +1627,9 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_GRAPH_GRAPH_TIME) ftrace_graph_graph_time_control(set); + if (bit == TRACE_GRAPH_ARGS) + return ftrace_graph_trace_args(tr, set); + return 0; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7294ad676379..40c39e946940 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -150,7 +150,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, trace_ctx = tracing_gen_ctx_flags(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); atomic_dec(&data->disabled); } @@ -250,8 +250,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void irqsoff_trace_close(struct trace_iterator *iter) @@ -295,11 +293,17 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } #else -#define __trace_function trace_function +static inline void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned int trace_ctx) +{ + return trace_function(tr, ip, parent_ip, trace_ctx, NULL); +} static enum print_line_t irqsoff_print_line(struct 
trace_iterator *iter) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d8d5f18a141a..2703b96d8990 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -124,9 +124,8 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk) if (!p) return true; *p = '\0'; - rcu_read_lock_sched(); - ret = !!find_module(tk->symbol); - rcu_read_unlock_sched(); + scoped_guard(rcu) + ret = !!find_module(tk->symbol); *p = ':'; return ret; @@ -796,12 +795,10 @@ static struct module *try_module_get_by_name(const char *name) { struct module *mod; - rcu_read_lock_sched(); + guard(rcu)(); mod = find_module(name); if (mod && !try_module_get(mod)) mod = NULL; - rcu_read_unlock_sched(); - return mod; } #else @@ -1007,8 +1004,11 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) + if (argc > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index f3a2722ee4c0..e732c9e37e14 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -316,33 +316,6 @@ static inline void osn_var_reset_all(void) bool trace_osnoise_callback_enabled; /* - * osnoise sample structure definition. Used to store the statistics of a - * sample run. - */ -struct osnoise_sample { - u64 runtime; /* runtime */ - u64 noise; /* noise */ - u64 max_sample; /* max single noise sample */ - int hw_count; /* # HW (incl. hypervisor) interference */ - int nmi_count; /* # NMIs during this sample */ - int irq_count; /* # IRQs during this sample */ - int softirq_count; /* # softirqs during this sample */ - int thread_count; /* # threads during this sample */ -}; - -#ifdef CONFIG_TIMERLAT_TRACER -/* - * timerlat sample structure definition. Used to store the statistics of - * a sample run. - */ -struct timerlat_sample { - u64 timer_latency; /* timer_latency */ - unsigned int seqnum; /* unique sequence */ - int context; /* timer context */ -}; -#endif - -/* * Tracer data. */ static struct osnoise_data { @@ -497,7 +470,7 @@ static void print_osnoise_headers(struct seq_file *s) * Record an osnoise_sample into the tracer buffer. */ static void -__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) +__record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct osnoise_entry *entry; @@ -520,17 +493,19 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe } /* - * Record an osnoise_sample on all osnoise instances. + * Record an osnoise_sample on all osnoise instances and fire trace event. 
*/ -static void trace_osnoise_sample(struct osnoise_sample *sample) +static void record_osnoise_sample(struct osnoise_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_osnoise_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_osnoise_sample(sample, buffer); + __record_osnoise_sample(sample, buffer); } rcu_read_unlock(); } @@ -574,7 +549,7 @@ static void print_timerlat_headers(struct seq_file *s) #endif /* CONFIG_PREEMPT_RT */ static void -__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) +__record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer) { struct ring_buffer_event *event; struct timerlat_entry *entry; @@ -594,15 +569,17 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf /* * Record an timerlat_sample into the tracer buffer. */ -static void trace_timerlat_sample(struct timerlat_sample *sample) +static void record_timerlat_sample(struct timerlat_sample *sample) { struct osnoise_instance *inst; struct trace_buffer *buffer; + trace_timerlat_sample(sample); + rcu_read_lock(); list_for_each_entry_rcu(inst, &osnoise_instances, list) { buffer = inst->tr->array_buffer.buffer; - __trace_timerlat_sample(sample, buffer); + __record_timerlat_sample(sample, buffer); } rcu_read_unlock(); } @@ -1542,27 +1519,25 @@ static int run_osnoise(void) /* * In some cases, notably when running on a nohz_full CPU with - * a stopped tick PREEMPT_RCU has no way to account for QSs. - * This will eventually cause unwarranted noise as PREEMPT_RCU - * will force preemption as the means of ending the current - * grace period. We avoid this problem by calling - * rcu_momentary_eqs(), which performs a zero duration - * EQS allowing PREEMPT_RCU to end the current grace period. - * This call shouldn't be wrapped inside an RCU critical - * section. + * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to + * account for QSs. This will eventually cause unwarranted + * noise as RCU forces preemption as the means of ending the + * current grace period. We avoid this by calling + * rcu_momentary_eqs(), which performs a zero duration EQS + * allowing RCU to end the current grace period. This call + * shouldn't be wrapped inside an RCU critical section. * - * Note that in non PREEMPT_RCU kernels QSs are handled through - * cond_resched() + * Normally QSs for other cases are handled through cond_resched(). + * For simplicity, however, we call rcu_momentary_eqs() for all + * configurations here. 
*/ - if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { - if (!disable_irq) - local_irq_disable(); + if (!disable_irq) + local_irq_disable(); - rcu_momentary_eqs(); + rcu_momentary_eqs(); - if (!disable_irq) - local_irq_enable(); - } + if (!disable_irq) + local_irq_enable(); /* * For the non-preemptive kernel config: let threads runs, if @@ -1608,7 +1583,7 @@ static int run_osnoise(void) /* Save interference stats info */ diff_osn_sample_stats(osn_var, &s); - trace_osnoise_sample(&s); + record_osnoise_sample(&s); notify_new_max_latency(max_noise); @@ -1803,7 +1778,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer) s.timer_latency = diff; s.context = IRQ_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing) { if (time_to_us(diff) >= osnoise_data.stop_tracing) { @@ -1901,8 +1876,7 @@ static int timerlat_main(void *data) tlat->count = 0; tlat->tracing_thread = false; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); tlat->kthread = current; osn_var->pid = current->pid; /* @@ -1923,7 +1897,7 @@ static int timerlat_main(void *data) s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -2032,7 +2006,6 @@ static int start_kthread(unsigned int cpu) if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); - stop_per_cpu_kthreads(); return -ENOMEM; } @@ -2456,8 +2429,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file) tlat = this_cpu_tmr_var(); tlat->count = 0; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); migrate_enable(); return 0; @@ -2529,7 +2501,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_URET; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); notify_new_max_latency(diff); @@ -2564,7 +2536,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count, s.timer_latency = diff; s.context = THREAD_CONTEXT; - trace_timerlat_sample(&s); + record_timerlat_sample(&s); if (osnoise_data.stop_tracing_total) { if (time_to_us(diff) >= osnoise_data.stop_tracing_total) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 03d56f711ad1..72b699f909e8 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -12,15 +12,19 @@ #include <linux/sched/clock.h> #include <linux/sched/mm.h> #include <linux/idr.h> +#include <linux/btf.h> +#include <linux/bpf.h> +#include <linux/hashtable.h> #include "trace_output.h" +#include "trace_btf.h" -/* must be a power of 2 */ -#define EVENT_HASHSIZE 128 +/* 2^7 = 128 */ +#define EVENT_HASH_BITS 7 DECLARE_RWSEM(trace_event_sem); -static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; +static DEFINE_HASHTABLE(event_hash, EVENT_HASH_BITS); enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) { @@ -684,6 +688,88 @@ int trace_print_lat_context(struct trace_iterator *iter) return !trace_seq_has_overflowed(s); } +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) +{ + const struct btf_param *param; + const struct btf_type *t; + const char *param_name; + char 
name[KSYM_NAME_LEN]; + unsigned long arg; + struct btf *btf; + s32 tid, nr = 0; + int a, p, x; + + trace_seq_printf(s, "("); + + if (!args) + goto out; + if (lookup_symbol_name(func, name)) + goto out; + + /* TODO: Pass module name here too */ + t = btf_find_func_proto(name, &btf); + if (IS_ERR_OR_NULL(t)) + goto out; + + param = btf_get_func_param(t, &nr); + if (!param) + goto out_put; + + for (a = 0, p = 0; p < nr; a++, p++) { + if (p) + trace_seq_puts(s, ", "); + + /* This only prints what the arch allows (6 args by default) */ + if (a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "..."); + break; + } + + arg = args[a]; + + param_name = btf_name_by_offset(btf, param[p].name_off); + if (param_name) + trace_seq_printf(s, "%s=", param_name); + t = btf_type_skip_modifiers(btf, param[p].type, &tid); + + switch (t ? BTF_INFO_KIND(t->info) : BTF_KIND_UNKN) { + case BTF_KIND_UNKN: + trace_seq_putc(s, '?'); + /* Still print unknown type values */ + fallthrough; + case BTF_KIND_PTR: + trace_seq_printf(s, "0x%lx", arg); + break; + case BTF_KIND_INT: + trace_seq_printf(s, "%ld", arg); + break; + case BTF_KIND_ENUM: + trace_seq_printf(s, "%ld", arg); + break; + default: + /* This does not handle complex arguments */ + trace_seq_printf(s, "(%s)[0x%lx", btf_type_str(t), arg); + for (x = sizeof(long); x < t->size; x += sizeof(long)) { + trace_seq_putc(s, ':'); + if (++a == FTRACE_REGS_MAX_ARGS) { + trace_seq_puts(s, "...]"); + goto out_put; + } + trace_seq_printf(s, "0x%lx", args[a]); + } + trace_seq_putc(s, ']'); + break; + } + } +out_put: + btf_put(btf); +out: + trace_seq_printf(s, ")"); +} +#endif + /** * ftrace_find_event - find a registered event * @type: the type of event to look for @@ -694,11 +780,8 @@ int trace_print_lat_context(struct trace_iterator *iter) struct trace_event *ftrace_find_event(int type) { struct trace_event *event; - unsigned key; - - key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry(event, &event_hash[key], node) { + hash_for_each_possible(event_hash, event, node, type) { if (event->type == type) return event; } @@ -753,7 +836,6 @@ void trace_event_read_unlock(void) */ int register_trace_event(struct trace_event *event) { - unsigned key; int ret = 0; down_write(&trace_event_sem); @@ -786,9 +868,7 @@ int register_trace_event(struct trace_event *event) if (event->funcs->binary == NULL) event->funcs->binary = trace_nop_print; - key = event->type & (EVENT_HASHSIZE - 1); - - hlist_add_head(&event->node, &event_hash[key]); + hash_add(event_hash, &event->node, event->type); ret = event->type; out: @@ -803,7 +883,7 @@ EXPORT_SYMBOL_GPL(register_trace_event); */ int __unregister_trace_event(struct trace_event *event) { - hlist_del(&event->node); + hash_del(&event->node); free_trace_event_type(event->type); return 0; } @@ -1005,12 +1085,15 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, } static void print_fn_trace(struct trace_seq *s, unsigned long ip, - unsigned long parent_ip, long delta, int flags) + unsigned long parent_ip, long delta, + unsigned long *args, int flags) { ip += delta; parent_ip += delta; seq_print_ip_sym(s, ip, flags); + if (args) + print_function_args(s, args, ip); if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { trace_seq_puts(s, " <-"); @@ -1024,10 +1107,19 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; + unsigned long *args; + int args_size; trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, 
field->parent_ip, iter->tr->text_delta, flags); + args_size = iter->ent_size - offsetof(struct ftrace_entry, args); + if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) + args = field->args; + else + args = NULL; + + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, + args, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1700,7 +1792,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags); + print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, NULL, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index dca40f1f1da4..2e305364f2a9 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -41,5 +41,14 @@ extern struct rw_semaphore trace_event_sem; #define SEQ_PUT_HEX_FIELD(s, x) \ trace_seq_putmem_hex(s, &(x), sizeof(x)) +#ifdef CONFIG_FUNCTION_TRACE_ARGS +void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func); +#else +static inline void print_function_args(struct trace_seq *s, unsigned long *args, + unsigned long func) { + trace_seq_puts(s, "()"); +} +#endif #endif diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f58ee1e8858..2eeecb6c95ee 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -770,6 +770,10 @@ static int check_prepare_btf_string_fetch(char *typename, #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API +/* + * Add the entry code to store the 'argnum'th parameter and return the offset + * in the entry data buffer where the data will be stored. + */ static int __store_entry_arg(struct trace_probe *tp, int argnum) { struct probe_entry_arg *earg = tp->entry_arg; @@ -793,6 +797,20 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) tp->entry_arg = earg; } + /* + * The entry code array is repeating the pair of + * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)] + * and the rest of entries are filled with [FETCH_OP_END]. + * + * To reduce the redundant function parameter fetching, we scan the entry + * code array to find the FETCH_OP_ARG which already fetches the 'argnum' + * parameter. If it doesn't match, update 'offset' to find the last + * offset. + * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we + * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and + * return data offset so that caller can find the data offset in the entry + * data buffer. + */ offset = 0; for (i = 0; i < earg->size - 1; i++) { switch (earg->code[i].op) { @@ -826,6 +844,16 @@ int traceprobe_get_entry_data_size(struct trace_probe *tp) if (!earg) return 0; + /* + * earg->code[] array has an operation sequence which is run in + * the entry handler. + * The sequence stopped by FETCH_OP_END and each data stored in + * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA + * stores the data at the data buffer + its offset, and all data are + * "unsigned long" size. The offset must be increased when a data is + * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the + * code array. 
+ */ for (i = 0; i < earg->size; i++) { switch (earg->code[i].op) { case FETCH_OP_END: diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5803e6a41570..854e5668f5ee 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -36,7 +36,6 @@ #define MAX_BTF_ARGS_LEN 128 #define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX -#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -481,6 +480,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ C(NO_TRACEPOINT, "Tracepoint is not found"), \ + C(BAD_TP_NAME, "Invalid character in tracepoint name"),\ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ @@ -544,7 +544,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_BTF_FIELD, "This field is not found."), \ C(BAD_BTF_TID, "Failed to get BTF type info."),\ C(BAD_TYPE4STR, "This type does not fit for string."),\ - C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"), + C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ + C(TOO_MANY_ARGS, "Too many arguments are specified"), \ + C(TOO_MANY_EARGS, "Too many entry arguments specified"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index af30586f1aea..a0db3404f7f7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -188,8 +188,6 @@ static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - else - iter->private = NULL; } static void wakeup_trace_close(struct trace_iterator *iter) @@ -242,7 +240,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, return; local_irq_save(flags); - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, fregs); local_irq_restore(flags); atomic_dec(&data->disabled); @@ -327,7 +325,7 @@ __trace_function(struct trace_array *tr, if (is_graph(tr)) trace_graph_function(tr, ip, parent_ip, trace_ctx); else - trace_function(tr, ip, parent_ip, trace_ctx); + trace_function(tr, ip, parent_ip, trace_ctx, NULL); } static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ccc762fbb69c..3386439ec9f6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -562,8 +562,14 @@ static int __trace_uprobe_create(int argc, const char **argv) if (argc < 2) return -ECANCELED; - if (argc - 2 > MAX_TRACE_ARGS) + + trace_probe_log_init("trace_uprobe", argc, argv); + + if (argc - 2 > MAX_TRACE_ARGS) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_ARGS); return -E2BIG; + } if (argv[0][1] == ':') event = &argv[0][2]; @@ -582,7 +588,6 @@ static int __trace_uprobe_create(int argc, const char **argv) return -ECANCELED; } - trace_probe_log_init("trace_uprobe", argc, argv); trace_probe_log_set_index(1); /* filename is the 2nd argument */ *arg++ = '\0'; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1848ce7e2976..62719d2941c9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -127,7 +127,7 @@ static void debug_print_probes(struct tracepoint_func 
*funcs) return; for (i = 0; funcs[i].func; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func); + printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func); } static struct tracepoint_func * diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa0b2e47f2f2..682f40d5632d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -238,7 +238,7 @@ EXPORT_SYMBOL(__put_user_ns); struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ - u32 count; /* == 0 unless used with map_id_range_down() */ + u32 count; }; /* @@ -343,16 +343,19 @@ u32 map_id_down(struct uid_gid_map *map, u32 id) * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * -map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; - u32 first, last; + u32 first, last, id2; + + id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; - if (id >= first && id <= last) + if (id >= first && id <= last && + (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; @@ -363,28 +366,28 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * -map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; - key.count = 1; + key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } -u32 map_id_up(struct uid_gid_map *map, u32 id) +u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - extent = map_id_up_base(extents, map, id); + extent = map_id_range_up_base(extents, map, id, count); else - extent = map_id_up_max(extents, map, id); + extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) @@ -395,6 +398,11 @@ u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return map_id_range_up(map, id, 1); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. 
* @ns: User namespace that the uid is in diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 8800f5acc007..2ef2e1b80091 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -133,7 +133,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); if (!vtsk) - return NULL; + return ERR_PTR(-ENOMEM); init_completion(&vtsk->exited); mutex_init(&vtsk->exit_mutex); vtsk->data = arg; @@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); - return NULL; + return ERR_PTR(PTR_ERR(tsk)); } vtsk->task = tsk; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 5267adeaa403..7e45559521af 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -101,12 +101,11 @@ static bool post_one_notification(struct watch_queue *wqueue, struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; - unsigned int head, tail, mask, note, offset, len; + unsigned int head, tail, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) @@ -124,7 +123,7 @@ static bool post_one_notification(struct watch_queue *wqueue, memcpy(p + offset, n, len); kunmap_atomic(p); - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; @@ -147,7 +146,7 @@ out: return done; lost: - buf = &pipe->bufs[(head - 1) & mask]; + buf = pipe_buf(pipe, head - 1); buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } @@ -269,6 +268,15 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) if (ret < 0) goto error; + /* + * pipe_resize_ring() does not update nr_accounted for watch_queue + * pipes, because the above vastly overprovisions. Set nr_accounted on + * and max_usage this pipe to the number that was actually charged to + * the user above via account_pipe_buffers. + */ + pipe->max_usage = nr_pages; + pipe->nr_accounted = nr_pages; + ret = -ENOMEM; pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b2da7de39d06..9fa2af9dbf2c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -347,8 +347,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -static void __lockup_detector_cleanup(void); - #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM enum stats_per_group { STATS_SYSTEM, @@ -797,8 +795,7 @@ static void watchdog_enable(unsigned int cpu) * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hrtimer->function = watchdog_timer_fn; + hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED_HARD); @@ -886,11 +883,6 @@ static void __lockup_detector_reconfigure(void) watchdog_hardlockup_start(); cpus_read_unlock(); - /* - * Must be called outside the cpus locked section to prevent - * recursive locking in the perf code. 
- */ - __lockup_detector_cleanup(); } void lockup_detector_reconfigure(void) @@ -940,24 +932,6 @@ static inline void lockup_detector_setup(void) } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_cleanup(void) -{ - lockdep_assert_held(&watchdog_mutex); - hardlockup_detector_perf_cleanup(); -} - -/** - * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes - * - * Caller must not hold the cpu hotplug rwsem. - */ -void lockup_detector_cleanup(void) -{ - mutex_lock(&watchdog_mutex); - __lockup_detector_cleanup(); - mutex_unlock(&watchdog_mutex); -} - /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 59c1d86a73a2..a78ff092d636 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -21,8 +21,6 @@ #include <linux/perf_event.h> static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -static DEFINE_PER_CPU(struct perf_event *, dead_event); -static struct cpumask dead_events_mask; static atomic_t watchdog_cpus = ATOMIC_INIT(0); @@ -146,6 +144,7 @@ static int hardlockup_detector_event_create(void) PTR_ERR(evt)); return PTR_ERR(evt); } + WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); this_cpu_write(watchdog_ev, evt); return 0; } @@ -181,37 +180,13 @@ void watchdog_hardlockup_disable(unsigned int cpu) if (event) { perf_event_disable(event); + perf_event_release_kernel(event); this_cpu_write(watchdog_ev, NULL); - this_cpu_write(dead_event, event); - cpumask_set_cpu(smp_processor_id(), &dead_events_mask); atomic_dec(&watchdog_cpus); } } /** - * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them - * - * Called from lockup_detector_cleanup(). Serialized by the caller. - */ -void hardlockup_detector_perf_cleanup(void) -{ - int cpu; - - for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(dead_event, cpu); - - /* - * Required because for_each_cpu() reports unconditionally - * CPU0 as set on UP kernels. Sigh. - */ - if (event) - perf_event_release_kernel(event); - per_cpu(dead_event, cpu) = NULL; - } - cpumask_clear(&dead_events_mask); -} - -/** * hardlockup_detector_perf_stop - Globally stop watchdog events * * Special interface for x86 to handle the perf HT bug. diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3c2c45313c88..bfe030b443e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2254,8 +2254,10 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queues a new work item to a wq after destroy_workqueue(wq). */ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && - WARN_ON_ONCE(!is_chained_work(wq)))) + WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", + work->func, wq->name))) { return; + } rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ @@ -3517,12 +3519,6 @@ repeat: } /* - * Put the reference grabbed by send_mayday(). @pool won't - * go away while we're still attached to it. - */ - put_pwq(pwq); - - /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. */ @@ -3532,6 +3528,12 @@ repeat: worker_detach_from_pool(rescuer); + /* + * Put the reference grabbed by send_mayday(). @pool might + * go away any time after it. + */ + put_pwq_unlocked(pwq); + raw_spin_lock_irq(&wq_mayday_lock); } |
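
The trace_fn_trace() hunk above decides whether a function-trace record carries an arguments payload purely from the record size: it compares iter->ent_size against offsetof() of the flexible tail. A rough, self-contained userspace sketch of that check follows; struct sample_entry, MAX_ARGS and entry_args() are invented stand-ins for struct ftrace_entry, FTRACE_REGS_MAX_ARGS and the open-coded test in the kernel.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ARGS 6	/* stand-in for FTRACE_REGS_MAX_ARGS */

struct sample_entry {
	unsigned long ip;
	unsigned long parent_ip;
	unsigned long args[];	/* optional tail, present only in newer records */
};

static const unsigned long *entry_args(const struct sample_entry *e, size_t ent_size)
{
	size_t args_size = ent_size - offsetof(struct sample_entry, args);

	/* trust args[] only if the record is large enough to hold all of them */
	return args_size >= MAX_ARGS * sizeof(long) ? e->args : NULL;
}

int main(void)
{
	size_t full_size = offsetof(struct sample_entry, args) + MAX_ARGS * sizeof(long);
	struct sample_entry *full = calloc(1, full_size);
	struct sample_entry *bare = calloc(1, sizeof(*bare));

	if (!full || !bare)
		return 1;

	printf("full record: args %s\n", entry_args(full, full_size) ? "present" : "absent");
	printf("bare record: args %s\n", entry_args(bare, sizeof(*bare)) ? "present" : "absent");
	free(full);
	free(bare);
	return 0;
}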
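
The user_namespace.c hunks make map_id_range_up_base() succeed only when the entire requested range [id, id + count - 1] lands inside a single extent. A minimal sketch of that containment check, assuming an invented struct extent and find_extent() that mirror the lower_first/count fields of struct uid_gid_extent:

#include <stdint.h>
#include <stdio.h>

struct extent { uint32_t lower_first; uint32_t count; };

static const struct extent *find_extent(const struct extent *e, unsigned int n,
					uint32_t id, uint32_t count)
{
	uint32_t last_id = id + count - 1;

	for (unsigned int i = 0; i < n; i++) {
		uint32_t first = e[i].lower_first;
		uint32_t last = first + e[i].count - 1;

		/* both ends of the requested range must fall in this extent */
		if (id >= first && id <= last && last_id >= first && last_id <= last)
			return &e[i];
	}
	return NULL;	/* range straddles extents or is unmapped */
}

int main(void)
{
	static const struct extent map[] = { { 1000, 10 }, { 2000, 5 } };

	printf("%s\n", find_extent(map, 2, 1005, 3) ? "mapped" : "unmapped");	/* mapped */
	printf("%s\n", find_extent(map, 2, 1008, 5) ? "mapped" : "unmapped");	/* unmapped */
	return 0;
}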
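
The vhost_task_create() hunk switches the failure paths from returning NULL to returning ERR_PTR() values, so callers see the actual error code. A hedged userspace sketch of that convention: ERR_PTR()/PTR_ERR()/IS_ERR() below are simplified re-implementations of the kernel macros, and create_obj() with its -ENOMEM path is made up for the example.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)	{ return (void *)error; }
static inline long PTR_ERR(const void *p)	{ return (long)p; }
static inline int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *create_obj(int fail)
{
	if (fail)
		return ERR_PTR(-ENOMEM);	/* propagate a precise error */
	return malloc(16);
}

int main(void)
{
	void *obj = create_obj(1);

	if (IS_ERR(obj)) {			/* caller can now report why */
		fprintf(stderr, "create_obj failed: %ld\n", PTR_ERR(obj));
		return 1;
	}
	free(obj);
	return 0;
}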
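
The watch_queue.c hunks drop the open-coded "&pipe->bufs[head & mask]" in favour of the pipe_buf() helper. The index-by-mask trick that helper wraps is shown in this small sketch for a power-of-two ring; RING_SIZE, ring[] and slot_for() are invented names for the example.

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8	/* must be a power of two for the mask trick */

static int ring[RING_SIZE];

static int *slot_for(unsigned int head)
{
	return &ring[head & (RING_SIZE - 1)];
}

int main(void)
{
	/* head keeps counting up; the mask folds it back into the array */
	*slot_for(5) = 42;
	*slot_for(5 + RING_SIZE) = 99;	/* same slot, one lap later */

	assert(slot_for(5) == slot_for(13));
	printf("slot 5 now holds %d\n", *slot_for(5));	/* 99 */
	return 0;
}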