Diffstat (limited to 'kernel')
243 files changed, 11245 insertions, 7456 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 87866b037fbe..434929de17ef 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,6 +21,11 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_context_tracking.o += -DDISABLE_BRANCH_PROFILING +endif + # Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() # in coverage traces. KCOV_INSTRUMENT_softirq.o := n diff --git a/kernel/acct.c b/kernel/acct.c index 179848ad33e9..6520baa13669 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -76,7 +76,7 @@ static int acct_parm[3] = {4, 2, 30}; #define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */ #ifdef CONFIG_SYSCTL -static struct ctl_table kern_acct_table[] = { +static const struct ctl_table kern_acct_table[] = { { .procname = "acct", .data = &acct_parm, @@ -103,48 +103,50 @@ struct bsd_acct_struct { atomic_long_t count; struct rcu_head rcu; struct mutex lock; - int active; + bool active; + bool check_space; unsigned long needcheck; struct file *file; struct pid_namespace *ns; struct work_struct work; struct completion done; + acct_t ac; }; -static void do_acct_process(struct bsd_acct_struct *acct); +static void fill_ac(struct bsd_acct_struct *acct); +static void acct_write_process(struct bsd_acct_struct *acct); /* * Check the amount of free space and suspend/resume accordingly. */ -static int check_free_space(struct bsd_acct_struct *acct) +static bool check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_after_jiffies(acct->needcheck)) - goto out; + if (!acct->check_space) + return acct->active; /* May block */ if (vfs_statfs(&acct->file->f_path, &sbuf)) - goto out; + return acct->active; if (acct->active) { u64 suspend = sbuf.f_blocks * SUSPEND; do_div(suspend, 100); if (sbuf.f_bavail <= suspend) { - acct->active = 0; + acct->active = false; pr_info("Process accounting paused\n"); } } else { u64 resume = sbuf.f_blocks * RESUME; do_div(resume, 100); if (sbuf.f_bavail >= resume) { - acct->active = 1; + acct->active = true; pr_info("Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; -out: return acct->active; } @@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin) { struct bsd_acct_struct *acct = to_acct(pin); mutex_lock(&acct->lock); - do_acct_process(acct); + /* + * Fill the accounting struct with the exiting task's info + * before punting to the workqueue. + */ + fill_ac(acct); schedule_work(&acct->work); wait_for_completion(&acct->done); cmpxchg(&acct->ns->bacct, pin, NULL); @@ -202,6 +208,9 @@ static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; + + /* We were fired by acct_pin_kill() which holds acct->lock. */ + acct_write_process(acct); if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); @@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname) return -EACCES; } + /* Exclude kernel kernel internal filesystems. */ + if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + + /* Exclude procfs and sysfs. 
*/ + if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) { + kfree(acct); + filp_close(file, NULL); + return -EINVAL; + } + if (!(file->f_mode & FMODE_CAN_WRITE)) { kfree(acct); filp_close(file, NULL); @@ -430,13 +453,27 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. */ -static void fill_ac(acct_t *ac) +static void fill_ac(struct bsd_acct_struct *acct) { struct pacct_struct *pacct = ¤t->signal->pacct; + struct file *file = acct->file; + acct_t *ac = &acct->ac; u64 elapsed, run_time; time64_t btime; struct tty_struct *tty; + lockdep_assert_held(&acct->lock); + + if (time_is_after_jiffies(acct->needcheck)) { + acct->check_space = false; + + /* Don't fill in @ac if nothing will be written. */ + if (!acct->active) + return; + } else { + acct->check_space = true; + } + /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. @@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac) ac->ac_majflt = encode_comp_t(pacct->ac_majflt); ac->ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(¤t->sighand->siglock); -} -/* - * do_acct_process does all actual work. Caller holds the reference to file. - */ -static void do_acct_process(struct bsd_acct_struct *acct) -{ - acct_t ac; - unsigned long flim; - const struct cred *orig_cred; - struct file *file = acct->file; - /* - * Accounting records are not subject to resource limits. - */ - flim = rlimit(RLIMIT_FSIZE); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct)) - goto out; - - fill_ac(&ac); /* we really need to bite the bullet and change layout */ - ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); - ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); + ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid()); + ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid()); #if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ - ac.ac_uid16 = ac.ac_uid; - ac.ac_gid16 = ac.ac_gid; + ac->ac_uid16 = ac->ac_uid; + ac->ac_gid16 = ac->ac_gid; #elif ACCT_VERSION == 3 { struct pid_namespace *ns = acct->ns; - ac.ac_pid = task_tgid_nr_ns(current, ns); + ac->ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), - ns); + ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); } #endif +} + +static void acct_write_process(struct bsd_acct_struct *acct) +{ + struct file *file = acct->file; + const struct cred *cred; + acct_t *ac = &acct->ac; + + /* Perform file operations on behalf of whoever enabled accounting */ + cred = override_creds(file->f_cred); + /* - * Get freeze protection. If the fs is frozen, just skip the write - * as we could deadlock the system otherwise. + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. 
*/ - if (file_start_write_trylock(file)) { + if (check_free_space(acct) && file_start_write_trylock(file)) { /* it's been opened O_APPEND, so position is irrelevant */ loff_t pos = 0; - __kernel_write(file, &ac, sizeof(acct_t), &pos); + __kernel_write(file, ac, sizeof(acct_t), &pos); file_end_write(file); } -out: + + revert_creds(cred); +} + +static void do_acct_process(struct bsd_acct_struct *acct) +{ + unsigned long flim; + + /* Accounting records are not subject to resource limits. */ + flim = rlimit(RLIMIT_FSIZE); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + fill_ac(acct); + acct_write_process(acct); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - revert_creds(orig_cred); } /** diff --git a/kernel/audit.c b/kernel/audit.c index 6a95a6077953..5f5bf85bcc90 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1221,8 +1221,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; - char *ctx = NULL; - u32 len; + struct lsm_context lsmctx = { NULL, 0, 0 }; err = audit_netlink_ok(skb, msg_type); if (err) @@ -1472,27 +1471,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, break; } case AUDIT_SIGNAL_INFO: - len = 0; if (lsmprop_is_set(&audit_sig_lsm)) { - err = security_lsmprop_to_secctx(&audit_sig_lsm, &ctx, - &len); - if (err) + err = security_lsmprop_to_secctx(&audit_sig_lsm, + &lsmctx); + if (err < 0) return err; } - sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); + sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len), + GFP_KERNEL); if (!sig_data) { if (lsmprop_is_set(&audit_sig_lsm)) - security_release_secctx(ctx, len); + security_release_secctx(&lsmctx); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (lsmprop_is_set(&audit_sig_lsm)) { - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); + memcpy(sig_data->ctx, lsmctx.context, lsmctx.len); + security_release_secctx(&lsmctx); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, - sig_data, struct_size(sig_data, ctx, len)); + sig_data, struct_size(sig_data, ctx, + lsmctx.len)); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -2180,23 +2180,22 @@ void audit_log_key(struct audit_buffer *ab, char *key) int audit_log_task_context(struct audit_buffer *ab) { struct lsm_prop prop; - char *ctx = NULL; - unsigned len; + struct lsm_context ctx; int error; security_current_getlsmprop_subj(&prop); if (!lsmprop_is_set(&prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx, &len); - if (error) { + error = security_lsmprop_to_secctx(&prop, &ctx); + if (error < 0) { if (error != -EINVAL) goto error_path; return 0; } - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; error_path: diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 7f358740e958..367eaf2c78b7 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -350,11 +350,10 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - if (d_is_positive(d)) { - /* update watch filter fields */ - watch->dev = d->d_sb->s_dev; - watch->ino = d_backing_inode(d)->i_ino; - } + /* update watch filter fields */ + watch->dev = d->d_sb->s_dev; + watch->ino = d_backing_inode(d)->i_ino; + 
inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; @@ -419,10 +418,11 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret) { + if (ret && ret != -ENOENT) { audit_put_watch(watch); return ret; } + ret = 0; /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index bceb9f58a09e..e3f42018ed46 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1319,13 +1319,20 @@ int audit_compare_dname_path(const struct qstr *dname, const char *path, int par if (pathlen < dlen) return 1; - parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; - if (pathlen - parentlen != dlen) - return 1; + if (parentlen == AUDIT_NAME_FULL) + parentlen = parent_len(path); p = path + parentlen; - return strncmp(p, dname->name, dlen); + /* handle trailing slashes */ + pathlen -= parentlen; + while (p[pathlen - 1] == '/') + pathlen--; + + if (pathlen != dlen) + return 1; + + return memcmp(p, dname->name, dlen); } int audit_filter(int msgtype, unsigned int listtype) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 561d96affe9f..78fd876a5473 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1098,8 +1098,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - char *ctx = NULL; - u32 len; + struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1110,12 +1109,12 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx, &len)) { + if (security_lsmprop_to_secctx(prop, &ctx) < 0) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); } } audit_log_format(ab, " ocomm="); @@ -1393,15 +1392,14 @@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - char *ctx = NULL; - u32 len; + struct lsm_context lsmctx; if (security_lsmprop_to_secctx(&context->ipc.oprop, - &ctx, &len)) { + &lsmctx) < 0) { *call_panic = 1; } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " obj=%s", lsmctx.context); + security_release_secctx(&lsmctx); } } if (context->ipc.has_perm) { @@ -1560,15 +1558,14 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, MAJOR(n->rdev), MINOR(n->rdev)); if (lsmprop_is_set(&n->oprop)) { - char *ctx = NULL; - u32 len; + struct lsm_context ctx; - if (security_lsmprop_to_secctx(&n->oprop, &ctx, &len)) { + if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { if (call_panic) *call_panic = 2; } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); } } @@ -2210,10 +2207,8 @@ __audit_reusename(const __user char *uptr) list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; - if (n->name->uptr == uptr) { - atomic_inc(&n->name->refcnt); - return n->name; - } + if (n->name->uptr == uptr) + return 
refname(n->name); } return NULL; } @@ -2240,7 +2235,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - atomic_inc(&name->refcnt); + refname(name); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2372,7 +2367,7 @@ out_alloc: return; if (name) { n->name = name; - atomic_inc(&name->refcnt); + refname(name); } out: @@ -2499,7 +2494,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - atomic_inc(&found_child->name->refcnt); + refname(found_child->name); } } diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 945a5680f6a5..095a9554e1de 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -39,7 +39,7 @@ */ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ -#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8) +#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1) #define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { @@ -138,7 +138,11 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) INIT_LIST_HEAD(&arena->vma_list); bpf_map_init_from_attr(&arena->map, attr); range_tree_init(&arena->rt); - range_tree_set(&arena->rt, 0, attr->max_entries); + err = range_tree_set(&arena->rt, 0, attr->max_entries); + if (err) { + bpf_map_area_free(arena); + goto err; + } mutex_init(&arena->lock); return &arena->map; @@ -218,7 +222,7 @@ static u64 arena_map_mem_usage(const struct bpf_map *map) struct vma_list { struct vm_area_struct *vma; struct list_head head; - atomic_t mmap_count; + refcount_t mmap_count; }; static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) @@ -228,7 +232,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma) vml = kmalloc(sizeof(*vml), GFP_KERNEL); if (!vml) return -ENOMEM; - atomic_set(&vml->mmap_count, 1); + refcount_set(&vml->mmap_count, 1); vma->vm_private_data = vml; vml->vma = vma; list_add(&vml->head, &arena->vma_list); @@ -239,7 +243,7 @@ static void arena_vm_open(struct vm_area_struct *vma) { struct vma_list *vml = vma->vm_private_data; - atomic_inc(&vml->mmap_count); + refcount_inc(&vml->mmap_count); } static void arena_vm_close(struct vm_area_struct *vma) @@ -248,7 +252,7 @@ static void arena_vm_close(struct vm_area_struct *vma) struct bpf_arena *arena = container_of(map, struct bpf_arena, map); struct vma_list *vml = vma->vm_private_data; - if (!atomic_dec_and_test(&vml->mmap_count)) + if (!refcount_dec_and_test(&vml->mmap_count)) return; guard(mutex)(&arena->lock); /* update link list under lock */ @@ -257,8 +261,6 @@ static void arena_vm_close(struct vm_area_struct *vma) kfree(vml); } -#define MT_ENTRY ((void *)&arena_map_ops) /* unused. 
has to be valid pointer */ - static vm_fault_t arena_vm_fault(struct vm_fault *vmf) { struct bpf_map *map = vmf->vma->vm_file->private_data; @@ -443,7 +445,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt return 0; } - /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */ + /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */ pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL); if (!pages) return 0; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 6cdbb4c33d31..eb28c0f219ee 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -735,13 +735,13 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback u64 ret = 0; void *val; + cant_migrate(); + if (flags != 0) return -EINVAL; is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; array = container_of(map, struct bpf_array, map); - if (is_percpu) - migrate_disable(); for (i = 0; i < map->max_entries; i++) { if (is_percpu) val = this_cpu_ptr(array->pptrs[i]); @@ -756,8 +756,6 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback break; } - if (is_percpu) - migrate_enable(); return num_elems; } diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 20f05de92e9c..54ff2a85d4c0 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -15,22 +15,20 @@ static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); static void bpf_cgrp_storage_lock(void) { - migrate_disable(); + cant_migrate(); this_cpu_inc(bpf_cgrp_storage_busy); } static void bpf_cgrp_storage_unlock(void) { this_cpu_dec(bpf_cgrp_storage_busy); - migrate_enable(); } static bool bpf_cgrp_storage_trylock(void) { - migrate_disable(); + cant_migrate(); if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { this_cpu_dec(bpf_cgrp_storage_busy); - migrate_enable(); return false; } return true; @@ -47,17 +45,18 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; + migrate_disable(); rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); - if (!local_storage) { - rcu_read_unlock(); - return; - } + if (!local_storage) + goto out; bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); +out: rcu_read_unlock(); + migrate_enable(); } static struct bpf_local_storage_data * @@ -154,7 +153,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *map) { - bpf_local_storage_map_free(map, &cgroup_cache, NULL); + bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy); } /* *gfp_flags* is a hidden argument provided by the verifier */ diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index a51c82dee1bd..15a3eb9b02d9 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -62,16 +62,17 @@ void bpf_inode_storage_free(struct inode *inode) if (!bsb) return; + migrate_disable(); rcu_read_lock(); local_storage = rcu_dereference(bsb->storage); - if (!local_storage) { - rcu_read_unlock(); - return; - } + if (!local_storage) + goto out; bpf_local_storage_destroy(local_storage); +out: rcu_read_unlock(); + migrate_enable(); } static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 7e6a0af0afc1..fa56c30833ff 100644 --- 
a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -81,9 +81,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; if (smap->bpf_ma) { - migrate_disable(); selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); - migrate_enable(); if (selem) /* Keep the original bpf_map_kzalloc behavior * before started using the bpf_mem_cache_alloc. @@ -174,17 +172,14 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage, return; } - if (smap) { - migrate_disable(); + if (smap) bpf_mem_cache_free(&smap->storage_ma, local_storage); - migrate_enable(); - } else { + else /* smap could be NULL if the selem that triggered * this 'local_storage' creation had been long gone. * In this case, directly do call_rcu(). */ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); - } } /* rcu tasks trace callback for bpf_ma == false */ @@ -217,7 +212,10 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) selem = container_of(rcu, struct bpf_local_storage_elem, rcu); /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); + + migrate_disable(); bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + migrate_enable(); bpf_mem_cache_raw_free(selem); } @@ -256,9 +254,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, * bpf_mem_cache_free will be able to reuse selem * immediately. */ - migrate_disable(); bpf_mem_cache_free(&smap->selem_ma, selem); - migrate_enable(); return; } @@ -497,15 +493,11 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - if (smap->bpf_ma) { - migrate_disable(); + if (smap->bpf_ma) storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); - migrate_enable(); - } else { + else storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), gfp_flags | __GFP_NOWARN); - } - if (!storage) { err = -ENOMEM; goto uncharge; @@ -841,8 +833,12 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, smap->elem_size = offsetof(struct bpf_local_storage_elem, sdata.data[attr->value_size]); - smap->bpf_ma = bpf_ma; - if (bpf_ma) { + /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non + * preemptible context. Thus, enforce all storages to use + * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled. + */ + smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? 
true : bpf_ma; + if (smap->bpf_ma) { err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); if (err) goto free_smap; @@ -898,15 +894,11 @@ void bpf_local_storage_map_free(struct bpf_map *map, while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { - if (busy_counter) { - migrate_disable(); + if (busy_counter) this_cpu_inc(*busy_counter); - } bpf_selem_unlink(selem, true); - if (busy_counter) { + if (busy_counter) this_cpu_dec(*busy_counter); - migrate_enable(); - } cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 606efe32485a..040fb1cd840b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -310,6 +310,20 @@ void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc) kfree(arg_info); } +static bool is_module_member(const struct btf *btf, u32 id) +{ + const struct btf_type *t; + + t = btf_type_resolve_ptr(btf, id, NULL); + if (!t) + return false; + + if (!__btf_type_is_struct(t) && !btf_type_is_fwd(t)) + return false; + + return !strcmp(btf_name_by_offset(btf, t->name_off), "module"); +} + int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log) @@ -389,6 +403,13 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, goto errout; } + if (!st_ops_ids[IDX_MODULE_ID] && is_module_member(btf, member->type)) { + pr_warn("'struct module' btf id not found. Is CONFIG_MODULES enabled? bpf_struct_ops '%s' needs module support.\n", + st_ops->name); + err = -EOPNOTSUPP; + goto errout; + } + func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index bf7fa15fdcc6..1109475953c0 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -24,22 +24,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy); static void bpf_task_storage_lock(void) { - migrate_disable(); + cant_migrate(); this_cpu_inc(bpf_task_storage_busy); } static void bpf_task_storage_unlock(void) { this_cpu_dec(bpf_task_storage_busy); - migrate_enable(); } static bool bpf_task_storage_trylock(void) { - migrate_disable(); + cant_migrate(); if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) { this_cpu_dec(bpf_task_storage_busy); - migrate_enable(); return false; } return true; @@ -72,18 +70,19 @@ void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; + migrate_disable(); rcu_read_lock(); local_storage = rcu_dereference(task->bpf_storage); - if (!local_storage) { - rcu_read_unlock(); - return; - } + if (!local_storage) + goto out; bpf_task_storage_lock(); bpf_local_storage_destroy(local_storage); bpf_task_storage_unlock(); +out: rcu_read_unlock(); + migrate_enable(); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e5a5f023cedd..c3223e0db2f5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -498,11 +498,6 @@ bool btf_type_is_void(const struct btf_type *t) return t == &btf_void; } -static bool btf_type_is_fwd(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; @@ -6512,6 +6507,8 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { /* rxrpc */ { "rxrpc_recvdata", 0x1 }, { 
"rxrpc_resend", 0x10 }, + /* skb */ + {"kfree_skb", 0x1000}, /* sunrpc */ { "xs_stream_read_data", 0x1 }, /* ... from xprt_cong_event event class */ @@ -7887,14 +7884,9 @@ struct btf *btf_get_by_fd(int fd) struct btf *btf; CLASS(fd, f)(fd); - if (fd_empty(f)) - return ERR_PTR(-EBADF); - - if (fd_file(f)->f_op != &btf_fops) - return ERR_PTR(-EINVAL); - - btf = fd_file(f)->private_data; - refcount_inc(&btf->refcnt); + btf = __btf_get_by_fd(f); + if (!IS_ERR(btf)) + refcount_inc(&btf->refcnt); return btf; } @@ -8011,17 +8003,6 @@ struct btf_module { static LIST_HEAD(btf_modules); static DEFINE_MUTEX(btf_module_mutex); -static ssize_t -btf_module_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t len) -{ - const struct btf *btf = bin_attr->private; - - memcpy(buf, btf->data + off, len); - return len; -} - static void purge_cand_cache(struct btf *btf); static int btf_module_notify(struct notifier_block *nb, unsigned long op, @@ -8082,8 +8063,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, attr->attr.name = btf->name; attr->attr.mode = 0444; attr->size = btf->data_size; - attr->private = btf; - attr->read = btf_module_read; + attr->private = btf->data; + attr->read_new = sysfs_bin_attr_simple_read; err = sysfs_create_bin_file(btf_kobj, attr); if (err) { diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a2f46785ac3b..774accbd4a22 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -190,7 +190,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, int err; rxq.dev = xdpf->dev_rx; - rxq.mem = xdpf->mem; + rxq.mem.type = xdpf->mem_type; /* TODO: report queue_index to xdp_rxq_info */ xdp_convert_frame_to_buff(xdpf, &xdp); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 33c473d676a5..cfa1c18e3a48 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -91,9 +91,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) if (!refcount_dec_and_test(&cpumask->usage)) return; - migrate_disable(); bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask); - migrate_enable(); } __bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3aa002a47a96..482d284a1553 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -678,7 +678,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { int err; @@ -701,7 +701,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; @@ -720,8 +720,8 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3ec941a0ea41..4a9eeb7aef85 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -824,13 +824,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node 
*node) hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { hlist_nulls_del_rcu(&l->hash_node); - check_and_free_fields(htab, l); bpf_map_dec_elem_count(&htab->map); break; } htab_unlock_bucket(htab, b, tgt_l->hash, flags); + if (l == tgt_l) + check_and_free_fields(htab, l); return l == tgt_l; } @@ -897,11 +898,9 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) { check_and_free_fields(htab, l); - migrate_disable(); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); bpf_mem_cache_free(&htab->ma, l); - migrate_enable(); } static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) @@ -1502,10 +1501,9 @@ static void delete_all_elements(struct bpf_htab *htab) { int i; - /* It's called from a worker thread, so disable migration here, - * since bpf_mem_cache_free() relies on that. + /* It's called from a worker thread and migration has been disabled, + * therefore, it is OK to invoke bpf_mem_cache_free() directly. */ - migrate_disable(); for (i = 0; i < htab->n_buckets; i++) { struct hlist_nulls_head *head = select_bucket(htab, i); struct hlist_nulls_node *n; @@ -1517,7 +1515,6 @@ static void delete_all_elements(struct bpf_htab *htab) } cond_resched(); } - migrate_enable(); } static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) @@ -1638,41 +1635,44 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, l = lookup_elem_raw(head, hash, key, key_size); if (!l) { ret = -ENOENT; - } else { - if (is_percpu) { - u32 roundup_value_size = round_up(map->value_size, 8); - void __percpu *pptr; - int off = 0, cpu; + goto out_unlock; + } - pptr = htab_elem_get_ptr(l, key_size); - for_each_possible_cpu(cpu) { - copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu)); - check_and_init_map_value(&htab->map, value + off); - off += roundup_value_size; - } - } else { - u32 roundup_key_size = round_up(map->key_size, 8); + if (is_percpu) { + u32 roundup_value_size = round_up(map->value_size, 8); + void __percpu *pptr; + int off = 0, cpu; - if (flags & BPF_F_LOCK) - copy_map_value_locked(map, value, l->key + - roundup_key_size, - true); - else - copy_map_value(map, value, l->key + - roundup_key_size); - /* Zeroing special fields in the temp buffer */ - check_and_init_map_value(map, value); + pptr = htab_elem_get_ptr(l, key_size); + for_each_possible_cpu(cpu) { + copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu)); + check_and_init_map_value(&htab->map, value + off); + off += roundup_value_size; } + } else { + u32 roundup_key_size = round_up(map->key_size, 8); - hlist_nulls_del_rcu(&l->hash_node); - if (!is_lru_map) - free_htab_elem(htab, l); + if (flags & BPF_F_LOCK) + copy_map_value_locked(map, value, l->key + + roundup_key_size, + true); + else + copy_map_value(map, value, l->key + + roundup_key_size); + /* Zeroing special fields in the temp buffer */ + check_and_init_map_value(map, value); } + hlist_nulls_del_rcu(&l->hash_node); +out_unlock: htab_unlock_bucket(htab, b, hash, bflags); - if (is_lru_map && l) - htab_lru_push_free(htab, l); + if (l) { + if (is_lru_map) + htab_lru_push_free(htab, l); + else + free_htab_elem(htab, l); + } return ret; } @@ -2208,17 +2208,18 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ bool is_percpu; u64 ret = 0; + cant_migrate(); + if (flags != 0) return -EINVAL; is_percpu = htab_is_percpu(htab); roundup_key_size = round_up(map->key_size, 8); - /* disable migration so 
percpu value prepared here will be the - * same as the one seen by the bpf program with bpf_map_lookup_elem(). + /* migration has been disabled, so percpu value prepared here will be + * the same as the one seen by the bpf program with + * bpf_map_lookup_elem(). */ - if (is_percpu) - migrate_disable(); for (i = 0; i < htab->n_buckets; i++) { b = &htab->buckets[i]; rcu_read_lock(); @@ -2244,8 +2245,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ rcu_read_unlock(); } out: - if (is_percpu) - migrate_enable(); return num_elems; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 751c150f9e1c..672abe111282 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1284,8 +1284,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u atomic_set(&t->cancelling, 0); INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); - hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); - t->timer.function = bpf_timer_cb; + hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT); cb->value = (void *)async - map->record->timer_off; break; case BPF_ASYNC_TYPE_WQ: @@ -1593,10 +1592,24 @@ void bpf_timer_cancel_and_free(void *val) * To avoid these issues, punt to workqueue context when we are in a * timer callback. */ - if (this_cpu_read(hrtimer_running)) + if (this_cpu_read(hrtimer_running)) { queue_work(system_unbound_wq, &t->cb.delete_work); - else + return; + } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* If the timer is running on other CPU, also use a kworker to + * wait for the completion of the timer instead of trying to + * acquire a sleepable lock in hrtimer_cancel() to wait for its + * completion. + */ + if (hrtimer_try_to_cancel(&t->timer) >= 0) + kfree_rcu(t, cb.rcu); + else + queue_work(system_unbound_wq, &t->cb.delete_work); + } else { bpf_timer_delete_work(&t->cb.delete_work); + } } /* This function is called by map_delete/update_elem for individual element and @@ -2066,9 +2079,7 @@ unlock: /* The contained type can also have resources, including a * bpf_list_head which needs to be freed. */ - migrate_disable(); __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); - migrate_enable(); } } @@ -2105,9 +2116,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root, obj -= field->graph_root.node_offset; - migrate_disable(); __bpf_obj_drop_impl(obj, field->graph_root.value_rec, false); - migrate_enable(); } } @@ -3057,6 +3066,21 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user return ret + 1; } +/* Keep unsinged long in prototype so that kfunc is usable when emitted to + * vmlinux.h in BPF programs directly, but note that while in BPF prog, the + * unsigned long always points to 8-byte region on stack, the kernel may only + * read and write the 4-bytes on 32-bit. 
+ */ +__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag) +{ + local_irq_save(*flags__irq_flag); +} + +__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag) +{ + local_irq_restore(*flags__irq_flag); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3089,7 +3113,9 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_throw) +#ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) +#endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { @@ -3135,7 +3161,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +#ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) +#endif BTF_ID_FLAGS(func, bpf_wq_init) BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) BTF_ID_FLAGS(func, bpf_wq_start) @@ -3149,6 +3177,8 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE) BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_local_irq_save) +BTF_ID_FLAGS(func, bpf_local_irq_restore) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 9aaf5124648b..dc3aa91a6ba0 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,14 +150,14 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } -static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; @@ -166,7 +166,7 @@ static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, inc_nlink(dir); bpf_dentry_finalize(dentry, inode, dir); - return 0; + return NULL; } struct map_iter { diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 4a858fdb6476..38050f4ee400 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -537,6 +537,7 @@ static char slot_type_char[] = { [STACK_ZERO] = '0', [STACK_DYNPTR] = 'd', [STACK_ITER] = 'i', + [STACK_IRQ_FLAG] = 'f' }; static void print_liveness(struct bpf_verifier_env *env, @@ -753,9 +754,10 @@ static void print_reg_state(struct bpf_verifier_env *env, verbose(env, ")"); } -void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, - bool print_all) +void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, + u32 frameno, bool print_all) { + const struct bpf_func_state *state = vstate->frame[frameno]; const struct bpf_reg_state *reg; int i; @@ -843,11 +845,11 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st break; } } - if (state->acquired_refs && state->refs[0].id) { - verbose(env, " refs=%d", state->refs[0].id); - for (i = 1; i < state->acquired_refs; i++) - if (state->refs[i].id) 
- verbose(env, ",%d", state->refs[i].id); + if (vstate->acquired_refs && vstate->refs[0].id) { + verbose(env, " refs=%d", vstate->refs[0].id); + for (i = 1; i < vstate->acquired_refs; i++) + if (vstate->refs[i].id) + verbose(env, ",%d", vstate->refs[i].id); } if (state->in_callback_fn) verbose(env, " cb"); @@ -864,7 +866,8 @@ static inline u32 vlog_alignment(u32 pos) BPF_LOG_MIN_ALIGNMENT) - pos - 1; } -void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state) +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, + u32 frameno) { if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { /* remove new line character */ @@ -873,5 +876,5 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state } else { verbose(env, "%d:", env->insn_idx); } - print_verifier_state(env, state, false); + print_verifier_state(env, vstate, frameno, false); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index f8bc1e096182..e8a772e64324 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -289,16 +289,11 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) } static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie, - const void *value, - bool disable_migration) + const void *value) { struct lpm_trie_node *node; - if (disable_migration) - migrate_disable(); node = bpf_mem_cache_alloc(&trie->ma); - if (disable_migration) - migrate_enable(); if (!node) return NULL; @@ -342,10 +337,8 @@ static long trie_update_elem(struct bpf_map *map, if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - /* Allocate and fill a new node. Need to disable migration before - * invoking bpf_mem_cache_alloc(). - */ - new_node = lpm_trie_node_alloc(trie, value, true); + /* Allocate and fill a new node */ + new_node = lpm_trie_node_alloc(trie, value); if (!new_node) return -ENOMEM; @@ -425,8 +418,7 @@ static long trie_update_elem(struct bpf_map *map, goto out; } - /* migration is disabled within the locked scope */ - im_node = lpm_trie_node_alloc(trie, NULL, false); + im_node = lpm_trie_node_alloc(trie, NULL); if (!im_node) { trie->n_entries--; ret = -ENOMEM; @@ -452,11 +444,9 @@ static long trie_update_elem(struct bpf_map *map, out: raw_spin_unlock_irqrestore(&trie->lock, irq_flags); - migrate_disable(); if (ret) bpf_mem_cache_free(&trie->ma, new_node); bpf_mem_cache_free_rcu(&trie->ma, free_node); - migrate_enable(); return ret; } @@ -555,10 +545,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) out: raw_spin_unlock_irqrestore(&trie->lock, irq_flags); - migrate_disable(); bpf_mem_cache_free_rcu(&trie->ma, free_parent); bpf_mem_cache_free_rcu(&trie->ma, free_node); - migrate_enable(); return ret; } diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c index 5bdf9aadca3a..37b80a23ae1a 100644 --- a/kernel/bpf/range_tree.c +++ b/kernel/bpf/range_tree.c @@ -259,9 +259,7 @@ void range_tree_destroy(struct range_tree *rt) while ((rn = range_it_iter_first(rt, 0, -1U))) { range_it_remove(rn, rt); - migrate_disable(); bpf_mem_free(&bpf_global_ma, rn); - migrate_enable(); } } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e1cfe890e0be..1499d8caa9a3 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -268,8 +268,6 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma /* allow writable mapping for the consumer_pos only */ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; - } else 
{ - vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, @@ -289,8 +287,6 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma * position, and the ring buffer data itself. */ return -EPERM; - } else { - vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5684e8ce132d..e1e42e918ba7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -796,11 +796,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) if (!btf_is_kernel(field->kptr.btf)) { pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, field->kptr.btf_id); - migrate_disable(); __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? pointee_struct_meta->record : NULL, fields[i].type == BPF_KPTR_PERCPU); - migrate_enable(); } else { field->kptr.dtor(xchgd_field); } @@ -835,8 +833,14 @@ static void bpf_map_free(struct bpf_map *map) struct btf_record *rec = map->record; struct btf *btf = map->btf; - /* implementation dependent freeing */ + /* implementation dependent freeing. Disabling migration to simplify + * the free of values or special fields allocated from bpf memory + * allocator. + */ + migrate_disable(); map->ops->map_free(map); + migrate_enable(); + /* Delay freeing of btf_record for maps, as map_free * callback usually needs access to them. It is better to do it here * than require each callback to do the free itself manually. @@ -1031,7 +1035,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = { static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) { struct bpf_map *map = filp->private_data; - int err; + int err = 0; if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; @@ -1055,24 +1059,33 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) err = -EACCES; goto out; } + bpf_map_write_active_inc(map); } +out: + mutex_unlock(&map->freeze_mutex); + if (err) + return err; /* set default open/close callbacks */ vma->vm_ops = &bpf_map_default_vmops; vma->vm_private_data = map; vm_flags_clear(vma, VM_MAYEXEC); + /* If mapping is read-only, then disallow potentially re-mapping with + * PROT_WRITE by dropping VM_MAYWRITE flag. 
This VM_MAYWRITE clearing + * means that as far as BPF map's memory-mapped VMAs are concerned, + * VM_WRITE and VM_MAYWRITE and equivalent, if one of them is set, + * both should be set, so we can forget about VM_MAYWRITE and always + * check just VM_WRITE + */ if (!(vma->vm_flags & VM_WRITE)) - /* disallow re-mapping with PROT_WRITE */ vm_flags_clear(vma, VM_MAYWRITE); err = map->ops->map_mmap(map, vma); - if (err) - goto out; + if (err) { + if (vma->vm_flags & VM_WRITE) + bpf_map_write_active_dec(map); + } - if (vma->vm_flags & VM_MAYWRITE) - bpf_map_write_active_inc(map); -out: - mutex_unlock(&map->freeze_mutex); return err; } @@ -1964,8 +1977,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file, return err; } -#define MAP_LOOKUP_RETRIES 3 - int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1975,8 +1986,8 @@ int generic_map_lookup_batch(struct bpf_map *map, void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); void *buf, *buf_prevkey, *prev_key, *key, *value; - int err, retry = MAP_LOOKUP_RETRIES; u32 value_size, cp, max_count; + int err; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; @@ -2022,14 +2033,8 @@ int generic_map_lookup_batch(struct bpf_map *map, err = bpf_map_copy_value(map, key, value, attr->batch.elem_flags); - if (err == -ENOENT) { - if (retry) { - retry--; - continue; - } - err = -EINTR; - break; - } + if (err == -ENOENT) + goto next_key; if (err) goto free_buf; @@ -2044,12 +2049,12 @@ int generic_map_lookup_batch(struct bpf_map *map, goto free_buf; } + cp++; +next_key: if (!prev_key) prev_key = buf_prevkey; swap(prev_key, key); - retry = MAP_LOOKUP_RETRIES; - cp++; cond_resched(); } @@ -2730,7 +2735,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd +#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { @@ -6124,7 +6129,7 @@ static int bpf_unpriv_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table bpf_syscall_table[] = { +static const struct ctl_table bpf_syscall_table[] = { { .procname = "unprivileged_bpf_disabled", .data = &sysctl_unprivileged_bpf_disabled, diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index fedb54c94cdb..81d6cf90584a 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -12,24 +12,16 @@ extern char __start_BTF[]; extern char __stop_BTF[]; -static ssize_t -btf_vmlinux_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t len) -{ - memcpy(buf, __start_BTF + off, len); - return len; -} - static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, - .read = btf_vmlinux_read, + .read_new = sysfs_bin_attr_simple_read, }; struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { + bin_attr_btf_vmlinux.private = __start_BTF; bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; if (bin_attr_btf_vmlinux.size == 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 77f56674aaa9..6e604caa870c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -196,7 +196,8 @@ struct bpf_verifier_stack_elem { #define BPF_PRIV_STACK_MIN_SIZE 64 -static int acquire_reference_state(struct bpf_verifier_env *env, int 
insn_idx); +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx); +static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); static void invalidate_non_owning_refs(struct bpf_verifier_env *env); static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); @@ -286,6 +287,7 @@ struct bpf_call_arg_meta { u32 ret_btf_id; u32 subprogno; struct btf_field *kptr_field; + s64 const_map_key; }; struct bpf_kfunc_call_arg_meta { @@ -641,6 +643,11 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); } +static int irq_flag_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + return stack_slot_obj_get_spi(env, reg, "irq_flag", 1); +} + static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) { switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { @@ -752,7 +759,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (clone_ref_obj_id) id = clone_ref_obj_id; else - id = acquire_reference_state(env, insn_idx); + id = acquire_reference(env, insn_idx); if (id < 0) return id; @@ -1014,7 +1021,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, if (spi < 0) return spi; - id = acquire_reference_state(env, insn_idx); + id = acquire_reference(env, insn_idx); if (id < 0) return id; @@ -1136,10 +1143,136 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s return 0; } +static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx); +static int release_irq_state(struct bpf_verifier_state *state, int id); + +static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, + struct bpf_kfunc_call_arg_meta *meta, + struct bpf_reg_state *reg, int insn_idx) +{ + struct bpf_func_state *state = func(env, reg); + struct bpf_stack_state *slot; + struct bpf_reg_state *st; + int spi, i, id; + + spi = irq_flag_get_spi(env, reg); + if (spi < 0) + return spi; + + id = acquire_irq_state(env, insn_idx); + if (id < 0) + return id; + + slot = &state->stack[spi]; + st = &slot->spilled_ptr; + + __mark_reg_known_zero(st); + st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ + st->live |= REG_LIVE_WRITTEN; + st->ref_obj_id = id; + + for (i = 0; i < BPF_REG_SIZE; i++) + slot->slot_type[i] = STACK_IRQ_FLAG; + + mark_stack_slot_scratched(env, spi); + return 0; +} + +static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + struct bpf_stack_state *slot; + struct bpf_reg_state *st; + int spi, i, err; + + spi = irq_flag_get_spi(env, reg); + if (spi < 0) + return spi; + + slot = &state->stack[spi]; + st = &slot->spilled_ptr; + + err = release_irq_state(env->cur_state, st->ref_obj_id); + WARN_ON_ONCE(err && err != -EACCES); + if (err) { + int insn_idx = 0; + + for (int i = 0; i < env->cur_state->acquired_refs; i++) { + if (env->cur_state->refs[i].id == env->cur_state->active_irq_id) { + insn_idx = env->cur_state->refs[i].insn_idx; + break; + } + } + + verbose(env, "cannot restore irq state out of order, expected id=%d acquired at insn_idx=%d\n", + env->cur_state->active_irq_id, insn_idx); + return err; + } + + __mark_reg_not_init(env, st); + + /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ + st->live |= REG_LIVE_WRITTEN; + + for (i = 0; i < BPF_REG_SIZE; i++) + slot->slot_type[i] 
= STACK_INVALID; + + mark_stack_slot_scratched(env, spi); + return 0; +} + +static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + struct bpf_stack_state *slot; + int spi, i; + + /* For -ERANGE (i.e. spi not falling into allocated stack slots), we + * will do check_mem_access to check and update stack bounds later, so + * return true for that case. + */ + spi = irq_flag_get_spi(env, reg); + if (spi == -ERANGE) + return true; + if (spi < 0) + return false; + + slot = &state->stack[spi]; + + for (i = 0; i < BPF_REG_SIZE; i++) + if (slot->slot_type[i] == STACK_IRQ_FLAG) + return false; + return true; +} + +static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + struct bpf_func_state *state = func(env, reg); + struct bpf_stack_state *slot; + struct bpf_reg_state *st; + int spi, i; + + spi = irq_flag_get_spi(env, reg); + if (spi < 0) + return -EINVAL; + + slot = &state->stack[spi]; + st = &slot->spilled_ptr; + + if (!st->ref_obj_id) + return -EINVAL; + + for (i = 0; i < BPF_REG_SIZE; i++) + if (slot->slot_type[i] != STACK_IRQ_FLAG) + return -EINVAL; + return 0; +} + /* Check if given stack slot is "special": * - spilled register state (STACK_SPILL); * - dynptr state (STACK_DYNPTR); * - iter state (STACK_ITER). + * - irq flag state (STACK_IRQ_FLAG) */ static bool is_stack_slot_special(const struct bpf_stack_state *stack) { @@ -1149,6 +1282,7 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack) case STACK_SPILL: case STACK_DYNPTR: case STACK_ITER: + case STACK_IRQ_FLAG: return true; case STACK_INVALID: case STACK_MISC: @@ -1263,15 +1397,18 @@ out: return arr ? arr : ZERO_SIZE_PTR; } -static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src) +static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src) { dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs, sizeof(struct bpf_reference_state), GFP_KERNEL); if (!dst->refs) return -ENOMEM; - dst->active_locks = src->active_locks; dst->acquired_refs = src->acquired_refs; + dst->active_locks = src->active_locks; + dst->active_preempt_locks = src->active_preempt_locks; + dst->active_rcu_lock = src->active_rcu_lock; + dst->active_irq_id = src->active_irq_id; return 0; } @@ -1288,7 +1425,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st return 0; } -static int resize_reference_state(struct bpf_func_state *state, size_t n) +static int resize_reference_state(struct bpf_verifier_state *state, size_t n) { state->refs = realloc_array(state->refs, state->acquired_refs, n, sizeof(struct bpf_reference_state)); @@ -1331,94 +1468,130 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state * On success, returns a valid pointer id to associate with the register * On failure, returns a negative errno. 
*/ -static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) +static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) { - struct bpf_func_state *state = cur_func(env); + struct bpf_verifier_state *state = env->cur_state; int new_ofs = state->acquired_refs; - int id, err; + int err; err = resize_reference_state(state, state->acquired_refs + 1); if (err) - return err; - id = ++env->id_gen; - state->refs[new_ofs].type = REF_TYPE_PTR; - state->refs[new_ofs].id = id; + return NULL; state->refs[new_ofs].insn_idx = insn_idx; - return id; + return &state->refs[new_ofs]; +} + +static int acquire_reference(struct bpf_verifier_env *env, int insn_idx) +{ + struct bpf_reference_state *s; + + s = acquire_reference_state(env, insn_idx); + if (!s) + return -ENOMEM; + s->type = REF_TYPE_PTR; + s->id = ++env->id_gen; + return s->id; } static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type, int id, void *ptr) { - struct bpf_func_state *state = cur_func(env); - int new_ofs = state->acquired_refs; - int err; + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reference_state *s; - err = resize_reference_state(state, state->acquired_refs + 1); - if (err) - return err; - state->refs[new_ofs].type = type; - state->refs[new_ofs].id = id; - state->refs[new_ofs].insn_idx = insn_idx; - state->refs[new_ofs].ptr = ptr; + s = acquire_reference_state(env, insn_idx); + if (!s) + return -ENOMEM; + s->type = type; + s->id = id; + s->ptr = ptr; state->active_locks++; return 0; } -/* release function corresponding to acquire_reference_state(). Idempotent. */ -static int release_reference_state(struct bpf_func_state *state, int ptr_id) +static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reference_state *s; + + s = acquire_reference_state(env, insn_idx); + if (!s) + return -ENOMEM; + s->type = REF_TYPE_IRQ; + s->id = ++env->id_gen; + + state->active_irq_id = s->id; + return s->id; +} + +static void release_reference_state(struct bpf_verifier_state *state, int idx) { - int i, last_idx; + int last_idx; + size_t rem; + /* IRQ state requires the relative ordering of elements remaining the + * same, since it relies on the refs array to behave as a stack, so that + * it can detect out-of-order IRQ restore. Hence use memmove to shift + * the array instead of swapping the final element into the deleted idx. 
+ */ last_idx = state->acquired_refs - 1; + rem = state->acquired_refs - idx - 1; + if (last_idx && idx != last_idx) + memmove(&state->refs[idx], &state->refs[idx + 1], sizeof(*state->refs) * rem); + memset(&state->refs[last_idx], 0, sizeof(*state->refs)); + state->acquired_refs--; + return; +} + +static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr) +{ + int i; + for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].type != REF_TYPE_PTR) + if (state->refs[i].type != type) continue; - if (state->refs[i].id == ptr_id) { - if (last_idx && i != last_idx) - memcpy(&state->refs[i], &state->refs[last_idx], - sizeof(*state->refs)); - memset(&state->refs[last_idx], 0, sizeof(*state->refs)); - state->acquired_refs--; + if (state->refs[i].id == id && state->refs[i].ptr == ptr) { + release_reference_state(state, i); + state->active_locks--; return 0; } } return -EINVAL; } -static int release_lock_state(struct bpf_func_state *state, int type, int id, void *ptr) +static int release_irq_state(struct bpf_verifier_state *state, int id) { - int i, last_idx; + u32 prev_id = 0; + int i; + + if (id != state->active_irq_id) + return -EACCES; - last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].type != type) + if (state->refs[i].type != REF_TYPE_IRQ) continue; - if (state->refs[i].id == id && state->refs[i].ptr == ptr) { - if (last_idx && i != last_idx) - memcpy(&state->refs[i], &state->refs[last_idx], - sizeof(*state->refs)); - memset(&state->refs[last_idx], 0, sizeof(*state->refs)); - state->acquired_refs--; - state->active_locks--; + if (state->refs[i].id == id) { + release_reference_state(state, i); + state->active_irq_id = prev_id; return 0; + } else { + prev_id = state->refs[i].id; } } return -EINVAL; } -static struct bpf_reference_state *find_lock_state(struct bpf_verifier_env *env, enum ref_state_type type, +static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *state, enum ref_state_type type, int id, void *ptr) { - struct bpf_func_state *state = cur_func(env); int i; for (i = 0; i < state->acquired_refs; i++) { struct bpf_reference_state *s = &state->refs[i]; - if (s->type == REF_TYPE_PTR || s->type != type) + if (s->type != type) continue; if (s->id == id && s->ptr == ptr) @@ -1431,7 +1604,6 @@ static void free_func_state(struct bpf_func_state *state) { if (!state) return; - kfree(state->refs); kfree(state->stack); kfree(state); } @@ -1445,6 +1617,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } + kfree(state->refs); if (free_self) kfree(state); } @@ -1455,12 +1628,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, static int copy_func_state(struct bpf_func_state *dst, const struct bpf_func_state *src) { - int err; - - memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs)); - err = copy_reference_state(dst, src); - if (err) - return err; + memcpy(dst, src, offsetof(struct bpf_func_state, stack)); return copy_stack_state(dst, src); } @@ -1477,9 +1645,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; } + err = copy_reference_state(dst_state, src); + if (err) + return err; dst_state->speculative = src->speculative; - dst_state->active_rcu_lock = src->active_rcu_lock; - dst_state->active_preempt_lock = src->active_preempt_lock; dst_state->in_sleepable = src->in_sleepable; 
dst_state->curframe = src->curframe; dst_state->branches = src->branches; @@ -3206,10 +3375,27 @@ static int mark_reg_read(struct bpf_verifier_env *env, return 0; } -static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int spi, int nr_slots) { struct bpf_func_state *state = func(env, reg); - int spi, ret; + int err, i; + + for (i = 0; i < nr_slots; i++) { + struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; + + err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); + if (err) + return err; + + mark_stack_slot_scratched(env, spi - i); + } + return 0; +} + +static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + int spi; /* For CONST_PTR_TO_DYNPTR, it must have already been done by * check_reg_arg in check_helper_call and mark_btf_func_reg_size in @@ -3224,31 +3410,23 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state * * bounds and spi is the first dynptr slot. Simply mark stack slot as * read. */ - ret = mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); - if (ret) - return ret; - return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr, - state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64); + return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS); } static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { - struct bpf_func_state *state = func(env, reg); - int err, i; - - for (i = 0; i < nr_slots; i++) { - struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; - - err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); - if (err) - return err; + return mark_stack_slot_obj_read(env, reg, spi, nr_slots); +} - mark_stack_slot_scratched(env, spi - i); - } +static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + int spi; - return 0; + spi = irq_flag_get_spi(env, reg); + if (spi < 0) + return spi; + return mark_stack_slot_obj_read(env, reg, spi, 1); } /* This function is supposed to be used by the following 32-bit optimization @@ -4503,7 +4681,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_stack_mask(bt, fr)); verbose(env, "stack=%s: ", env->tmp_str_buf); - print_verifier_state(env, func, true); + print_verifier_state(env, st, fr, true); } } @@ -5128,7 +5306,7 @@ enum bpf_access_src { static int check_stack_range_initialized(struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum bpf_access_src type, + enum bpf_access_type type, struct bpf_call_arg_meta *meta); static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) @@ -5161,7 +5339,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env, /* Note that we pass a NULL meta, so raw access will not be permitted. 
*/ err = check_stack_range_initialized(env, ptr_regno, off, size, - false, ACCESS_DIRECT, NULL); + false, BPF_READ, NULL); if (err) return err; @@ -5501,13 +5679,15 @@ static bool in_sleepable(struct bpf_verifier_env *env) static bool in_rcu_cs(struct bpf_verifier_env *env) { return env->cur_state->active_rcu_lock || - cur_func(env)->active_locks || + env->cur_state->active_locks || !in_sleepable(env); } /* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */ BTF_SET_START(rcu_protected_types) +#ifdef CONFIG_NET BTF_ID(struct, prog_test_ref_kfunc) +#endif #ifdef CONFIG_CGROUPS BTF_ID(struct, cgroup) #endif @@ -5515,7 +5695,9 @@ BTF_ID(struct, cgroup) BTF_ID(struct, bpf_cpumask) #endif BTF_ID(struct, task_struct) +#ifdef CONFIG_CRYPTO BTF_ID(struct, bpf_crypto_ctx) +#endif BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) @@ -7011,7 +7193,7 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, static int check_stack_access_within_bounds( struct bpf_verifier_env *env, int regno, int off, int access_size, - enum bpf_access_src src, enum bpf_access_type type) + enum bpf_access_type type) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; @@ -7020,10 +7202,7 @@ static int check_stack_access_within_bounds( int err; char *err_extra; - if (src == ACCESS_HELPER) - /* We don't know if helpers are reading or writing (or both). */ - err_extra = " indirect access to"; - else if (type == BPF_READ) + if (type == BPF_READ) err_extra = " read from"; else err_extra = " write to"; @@ -7241,7 +7420,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_STACK) { /* Basic bounds checks. */ - err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); + err = check_stack_access_within_bounds(env, regno, off, size, t); if (err) return err; @@ -7461,13 +7640,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i static int check_stack_range_initialized( struct bpf_verifier_env *env, int regno, int off, int access_size, bool zero_size_allowed, - enum bpf_access_src type, struct bpf_call_arg_meta *meta) + enum bpf_access_type type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); int err, min_off, max_off, i, j, slot, spi; - char *err_extra = type == ACCESS_HELPER ? " indirect" : ""; - enum bpf_access_type bounds_check_type; /* Some accesses can write anything into the stack, others are * read-only. */ @@ -7478,18 +7655,10 @@ static int check_stack_range_initialized( return -EACCES; } - if (type == ACCESS_HELPER) { - /* The bounds checks for writes are more permissive than for - * reads. However, if raw_mode is not set, we'll do extra - * checks below. 
- */ - bounds_check_type = BPF_WRITE; + if (type == BPF_WRITE) clobber = true; - } else { - bounds_check_type = BPF_READ; - } - err = check_stack_access_within_bounds(env, regno, off, access_size, - type, bounds_check_type); + + err = check_stack_access_within_bounds(env, regno, off, access_size, type); if (err) return err; @@ -7506,8 +7675,8 @@ static int check_stack_range_initialized( char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n", - regno, err_extra, tn_buf); + verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", + regno, tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -7560,7 +7729,7 @@ static int check_stack_range_initialized( slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) { - verbose(env, "verifier bug: allocated_stack too small"); + verbose(env, "verifier bug: allocated_stack too small\n"); return -EFAULT; } @@ -7588,14 +7757,14 @@ static int check_stack_range_initialized( } if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", - err_extra, regno, min_off, i - min_off, access_size); + verbose(env, "invalid read from stack R%d off %d+%d size %d\n", + regno, min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n", - err_extra, regno, tn_buf, i - min_off, access_size); + verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n", + regno, tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -7670,7 +7839,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_stack_range_initialized( env, regno, reg->off, access_size, - zero_size_allowed, ACCESS_HELPER, meta); + zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: return check_ptr_to_btf_access(env, regs, regno, reg->off, access_size, BPF_READ, -1); @@ -7835,15 +8004,15 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg * Since only one bpf_spin_lock is allowed the checks are simpler than * reg_is_refcounted() logic. The verifier needs to remember only * one spin_lock instead of array of acquired_refs. - * cur_func(env)->active_locks remembers which map value element or allocated + * env->cur_state->active_locks remembers which map value element or allocated * object got locked and clears it after bpf_spin_unlock. 
*/ static int process_spin_lock(struct bpf_verifier_env *env, int regno, bool is_lock) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); - struct bpf_func_state *cur = cur_func(env); u64 val = reg->var_off.value; struct bpf_map *map = NULL; struct btf *btf = NULL; @@ -7910,7 +8079,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, return -EINVAL; } - if (release_lock_state(cur_func(env), REF_TYPE_LOCK, reg->id, ptr)) { + if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) { verbose(env, "bpf_spin_unlock of different lock\n"); return -EINVAL; } @@ -8982,6 +9151,69 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return 0; } +/* Returns constant key value in `value` if possible, else negative error */ +static int get_constant_map_key(struct bpf_verifier_env *env, + struct bpf_reg_state *key, + u32 key_size, + s64 *value) +{ + struct bpf_func_state *state = func(env, key); + struct bpf_reg_state *reg; + int slot, spi, off; + int spill_size = 0; + int zero_size = 0; + int stack_off; + int i, err; + u8 *stype; + + if (!env->bpf_capable) + return -EOPNOTSUPP; + if (key->type != PTR_TO_STACK) + return -EOPNOTSUPP; + if (!tnum_is_const(key->var_off)) + return -EOPNOTSUPP; + + stack_off = key->off + key->var_off.value; + slot = -stack_off - 1; + spi = slot / BPF_REG_SIZE; + off = slot % BPF_REG_SIZE; + stype = state->stack[spi].slot_type; + + /* First handle precisely tracked STACK_ZERO */ + for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--) + zero_size++; + if (zero_size >= key_size) { + *value = 0; + return 0; + } + + /* Check that stack contains a scalar spill of expected size */ + if (!is_spilled_scalar_reg(&state->stack[spi])) + return -EOPNOTSUPP; + for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--) + spill_size++; + if (spill_size != key_size) + return -EOPNOTSUPP; + + reg = &state->stack[spi].spilled_ptr; + if (!tnum_is_const(reg->var_off)) + /* Stack value not statically known */ + return -EOPNOTSUPP; + + /* We are relying on a constant value. So mark as precise + * to prevent pruning on it. 
+ */ + bt_set_frame_slot(&env->bt, key->frameno, spi); + err = mark_chain_precision_batch(env); + if (err < 0) + return err; + + *value = reg->var_off.value; + return 0; +} + +static bool can_elide_value_nullness(enum bpf_map_type type); + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn, @@ -8992,6 +9224,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, enum bpf_arg_type arg_type = fn->arg_type[arg]; enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; + u32 key_size; int err = 0; if (arg_type == ARG_DONTCARE) @@ -9125,8 +9358,20 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, - BPF_READ, false, NULL); + key_size = meta->map_ptr->key_size; + err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); + if (err) + return err; + if (can_elide_value_nullness(meta->map_ptr->map_type)) { + err = get_constant_map_key(env, reg, key_size, &meta->const_map_key); + if (err < 0) { + meta->const_map_key = -1; + if (err == -EOPNOTSUPP) + err = 0; + else + return err; + } + } break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) @@ -9658,21 +9903,38 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range reg->range = AT_PKT_END; } +static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id) +{ + int i; + + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].type != REF_TYPE_PTR) + continue; + if (state->refs[i].id == ref_obj_id) { + release_reference_state(state, i); + return 0; + } + } + return -EINVAL; +} + /* The pointer with the specified id has released its reference to kernel * resources. Identify all copies of the same pointer and clear the reference. + * + * This is the release function corresponding to acquire_reference(). Idempotent. */ -static int release_reference(struct bpf_verifier_env *env, - int ref_obj_id) +static int release_reference(struct bpf_verifier_env *env, int ref_obj_id) { + struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state; struct bpf_reg_state *reg; int err; - err = release_reference_state(cur_func(env), ref_obj_id); + err = release_reference_nomark(vstate, ref_obj_id); if (err) return err; - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->ref_obj_id == ref_obj_id) mark_reg_invalid(env, reg); })); @@ -9746,9 +10008,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls callsite, state->curframe + 1 /* frameno within this callchain */, subprog /* subprog number within this prog */); - /* Transfer references to the callee */ - err = copy_reference_state(callee, caller); - err = err ?: set_callee_state_cb(env, caller, callee, callsite); + err = set_callee_state_cb(env, caller, callee, callsite); if (err) goto err_out; @@ -9978,19 +10238,25 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, const char *sub_name = subprog_name(env, subprog); /* Only global subprogs cannot be called with a lock held. */ - if (cur_func(env)->active_locks) { + if (env->cur_state->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; } /* Only global subprogs cannot be called with preemption disabled. 
*/ - if (env->cur_state->active_preempt_lock) { + if (env->cur_state->active_preempt_locks) { verbose(env, "global function calls are not allowed with preemption disabled,\n" "use static function instead\n"); return -EINVAL; } + if (env->cur_state->active_irq_id) { + verbose(env, "global function calls are not allowed with IRQs disabled,\n" + "use static function instead\n"); + return -EINVAL; + } + if (err) { verbose(env, "Caller passes invalid args into func#%d ('%s')\n", subprog, sub_name); @@ -10027,9 +10293,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); - print_verifier_state(env, caller, true); + print_verifier_state(env, state, caller->frameno, true); verbose(env, "callee:\n"); - print_verifier_state(env, state->frame[state->curframe], true); + print_verifier_state(env, state, state->curframe, true); } return 0; @@ -10321,11 +10587,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) caller->regs[BPF_REG_0] = *r0; } - /* Transfer references to the caller */ - err = copy_reference_state(caller, callee); - if (err) - return err; - /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite, * there function call logic would reschedule callback visit. If iteration * converges is_state_visited() would prune that visit eventually. @@ -10338,9 +10599,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); - print_verifier_state(env, callee, true); + print_verifier_state(env, state, callee->frameno, true); verbose(env, "to caller at %d:\n", *insn_idx); - print_verifier_state(env, caller, true); + print_verifier_state(env, state, caller->frameno, true); } /* clear everything in the callee. In case of exceptional exits using * bpf_throw, this will be done by copy_verifier_state for extra frames. 
*/ @@ -10490,11 +10751,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit) { - struct bpf_func_state *state = cur_func(env); + struct bpf_verifier_state *state = env->cur_state; bool refs_lingering = false; int i; - if (!exception_exit && state->frameno) + if (!exception_exit && cur_func(env)->frameno) return 0; for (i = 0; i < state->acquired_refs; i++) { @@ -10511,7 +10772,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit { int err; - if (check_lock && cur_func(env)->active_locks) { + if (check_lock && env->cur_state->active_locks) { verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix); return -EINVAL; } @@ -10522,12 +10783,17 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit return err; } + if (check_lock && env->cur_state->active_irq_id) { + verbose(env, "%s cannot be used inside bpf_local_irq_save-ed region\n", prefix); + return -EINVAL; + } + if (check_lock && env->cur_state->active_rcu_lock) { verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix); return -EINVAL; } - if (check_lock && env->cur_state->active_preempt_lock) { + if (check_lock && env->cur_state->active_preempt_locks) { verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix); return -EINVAL; } @@ -10629,6 +10895,21 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno state->callback_subprogno == subprogno); } +/* Returns whether or not the given map type can potentially elide + * lookup return value nullness check. This is possible if the key + * is statically known. + */ +static bool can_elide_value_nullness(enum bpf_map_type type) +{ + switch (type) { + case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_ARRAY: + return true; + default: + return false; + } +} + static int get_helper_proto(struct bpf_verifier_env *env, int func_id, const struct bpf_func_proto **ptr) { @@ -10715,7 +10996,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } - if (env->cur_state->active_preempt_lock) { + if (env->cur_state->active_preempt_locks) { if (fn->might_sleep) { verbose(env, "sleepable helper %s#%d in non-preemptible region\n", func_id_name(func_id), func_id); @@ -10726,6 +11007,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn env->insn_aux_data[insn_idx].storage_get_func_atomic = true; } + if (env->cur_state->active_irq_id) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (in_sleepable(env) && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -10772,7 +11064,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn struct bpf_func_state *state; struct bpf_reg_state *reg; - err = release_reference_state(cur_func(env), ref_obj_id); + err = release_reference_nomark(env->cur_state, ref_obj_id); if (!err) { bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ if (reg->ref_obj_id == ref_obj_id) { @@ -10984,10 +11276,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn "kernel subsystem misconfigured 
verifier\n"); return -EINVAL; } + + if (func_id == BPF_FUNC_map_lookup_elem && + can_elide_value_nullness(meta.map_ptr->map_type) && + meta.const_map_key >= 0 && + meta.const_map_key < meta.map_ptr->max_entries) + ret_flag &= ~PTR_MAYBE_NULL; + regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; - if (!type_may_be_null(ret_type) && + if (!type_may_be_null(ret_flag) && btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } @@ -11105,7 +11404,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; } else if (is_acquire_function(func_id, meta.map_ptr)) { - int id = acquire_reference_state(env, insn_idx); + int id = acquire_reference(env, insn_idx); if (id < 0) return id; @@ -11287,6 +11586,11 @@ static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param return btf_param_match_suffix(btf, arg, "__str"); } +static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param *arg) +{ + return btf_param_match_suffix(btf, arg, "__irq_flag"); +} + static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, const struct btf_param *arg, const char *name) @@ -11440,6 +11744,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_CONST_STR, KF_ARG_PTR_TO_MAP, KF_ARG_PTR_TO_WORKQUEUE, + KF_ARG_PTR_TO_IRQ_FLAG, }; enum special_kfunc_type { @@ -11471,6 +11776,11 @@ enum special_kfunc_type { KF_bpf_iter_css_task_new, KF_bpf_session_cookie, KF_bpf_get_kmem_cache, + KF_bpf_local_irq_save, + KF_bpf_local_irq_restore, + KF_bpf_iter_num_new, + KF_bpf_iter_num_next, + KF_bpf_iter_num_destroy, }; BTF_SET_START(special_kfunc_set) @@ -11486,8 +11796,10 @@ BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) +#ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +#endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) @@ -11515,8 +11827,13 @@ BTF_ID(func, bpf_rcu_read_unlock) BTF_ID(func, bpf_rbtree_remove) BTF_ID(func, bpf_rbtree_add_impl) BTF_ID(func, bpf_rbtree_first) +#ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +#else +BTF_ID_UNUSED +BTF_ID_UNUSED +#endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) BTF_ID(func, bpf_dynptr_clone) @@ -11537,6 +11854,11 @@ BTF_ID(func, bpf_session_cookie) BTF_ID_UNUSED #endif BTF_ID(func, bpf_get_kmem_cache) +BTF_ID(func, bpf_local_irq_save) +BTF_ID(func, bpf_local_irq_restore) +BTF_ID(func, bpf_iter_num_new) +BTF_ID(func, bpf_iter_num_next) +BTF_ID(func, bpf_iter_num_destroy) static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -11627,6 +11949,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_IRQ_FLAG; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -11730,11 +12055,59 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, return 0; } +static int process_irq_flag(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct 
bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + bool irq_save; + int err; + + if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) { + irq_save = true; + } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) { + irq_save = false; + } else { + verbose(env, "verifier internal error: unknown irq flags kfunc\n"); + return -EFAULT; + } + + if (irq_save) { + if (!is_irq_flag_reg_valid_uninit(env, reg)) { + verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1); + return -EINVAL; + } + + err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false); + if (err) + return err; + + err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx); + if (err) + return err; + } else { + err = is_irq_flag_reg_valid_init(env, reg); + if (err) { + verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1); + return err; + } + + err = mark_irq_flag_read(env, reg); + if (err) + return err; + + err = unmark_stack_slot_irq_flag(env, reg); + if (err) + return err; + } + return 0; +} + + static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct btf_record *rec = reg_btf_record(reg); - if (!cur_func(env)->active_locks) { + if (!env->cur_state->active_locks) { verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); return -EFAULT; } @@ -11753,12 +12126,11 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id) { - struct bpf_func_state *state, *unused; + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *unused; struct bpf_reg_state *reg; int i; - state = cur_func(env); - if (!ref_obj_id) { verbose(env, "verifier internal error: ref_obj_id is zero for " "owning -> non-owning conversion\n"); @@ -11848,9 +12220,9 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ } id = reg->id; - if (!cur_func(env)->active_locks) + if (!env->cur_state->active_locks) return -EINVAL; - s = find_lock_state(env, REF_TYPE_LOCK, id, ptr); + s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr); if (!s) { verbose(env, "held lock and object are not in the same allocation\n"); return -EINVAL; @@ -11873,12 +12245,24 @@ static bool is_bpf_rbtree_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_rbtree_first]; } +static bool is_bpf_iter_num_api_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_iter_num_new] || + btf_id == special_kfunc_list[KF_bpf_iter_num_next] || + btf_id == special_kfunc_list[KF_bpf_iter_num_destroy]; +} + static bool is_bpf_graph_api_kfunc(u32 btf_id) { return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) || btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]; } +static bool kfunc_spin_allowed(u32 btf_id) +{ + return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id); +} + static bool is_sync_callback_calling_kfunc(u32 btf_id) { return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]; @@ -12307,6 +12691,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_IRQ_FLAG: break; default: WARN_ON_ONCE(1); @@ -12596,6 +12981,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_IRQ_FLAG: + if (reg->type 
!= PTR_TO_STACK) { + verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); + return -EINVAL; + } + ret = process_irq_flag(env, regno, meta); + if (ret < 0) + return ret; + break; } } @@ -12760,22 +13154,27 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return -EINVAL; } - if (env->cur_state->active_preempt_lock) { + if (env->cur_state->active_preempt_locks) { if (preempt_disable) { - env->cur_state->active_preempt_lock++; + env->cur_state->active_preempt_locks++; } else if (preempt_enable) { - env->cur_state->active_preempt_lock--; + env->cur_state->active_preempt_locks--; } else if (sleepable) { verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name); return -EACCES; } } else if (preempt_disable) { - env->cur_state->active_preempt_lock++; + env->cur_state->active_preempt_locks++; } else if (preempt_enable) { verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name); return -EINVAL; } + if (env->cur_state->active_irq_id && sleepable) { + verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name); + return -EACCES; + } + /* In case of release function, we get register number of refcounted * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ @@ -13069,7 +13468,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); if (is_kfunc_acquire(&meta)) { - int id = acquire_reference_state(env, insn_idx); + int id = acquire_reference(env, insn_idx); if (id < 0) return id; @@ -13794,64 +14193,56 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s32 smin_val = src_reg->s32_min_value; - u32 umin_val = src_reg->u32_min_value; - u32 umax_val = src_reg->u32_max_value; + s32 *dst_smin = &dst_reg->s32_min_value; + s32 *dst_smax = &dst_reg->s32_max_value; + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; + s32 tmp_prod[4]; - if (smin_val < 0 || dst_reg->s32_min_value < 0) { - /* Ain't nobody got time to multiply that sign */ - __mark_reg32_unbounded(dst_reg); - return; - } - /* Both values are positive, so we can work with unsigned and - * copy the result to signed (unless it exceeds S32_MAX). 
- */ - if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) { - /* Potential overflow, we know nothing */ - __mark_reg32_unbounded(dst_reg); - return; + if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) || + check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) { + /* Overflow possible, we know nothing */ + *dst_umin = 0; + *dst_umax = U32_MAX; } - dst_reg->u32_min_value *= umin_val; - dst_reg->u32_max_value *= umax_val; - if (dst_reg->u32_max_value > S32_MAX) { + if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) || + check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) || + check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) || + check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) { /* Overflow possible, we know nothing */ - dst_reg->s32_min_value = S32_MIN; - dst_reg->s32_max_value = S32_MAX; + *dst_smin = S32_MIN; + *dst_smax = S32_MAX; } else { - dst_reg->s32_min_value = dst_reg->u32_min_value; - dst_reg->s32_max_value = dst_reg->u32_max_value; + *dst_smin = min_array(tmp_prod, 4); + *dst_smax = max_array(tmp_prod, 4); } } static void scalar_min_max_mul(struct bpf_reg_state *dst_reg, struct bpf_reg_state *src_reg) { - s64 smin_val = src_reg->smin_value; - u64 umin_val = src_reg->umin_value; - u64 umax_val = src_reg->umax_value; + s64 *dst_smin = &dst_reg->smin_value; + s64 *dst_smax = &dst_reg->smax_value; + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; + s64 tmp_prod[4]; - if (smin_val < 0 || dst_reg->smin_value < 0) { - /* Ain't nobody got time to multiply that sign */ - __mark_reg64_unbounded(dst_reg); - return; - } - /* Both values are positive, so we can work with unsigned and - * copy the result to signed (unless it exceeds S64_MAX). 
- */ - if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) { - /* Potential overflow, we know nothing */ - __mark_reg64_unbounded(dst_reg); - return; + if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) || + check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) { + /* Overflow possible, we know nothing */ + *dst_umin = 0; + *dst_umax = U64_MAX; } - dst_reg->umin_value *= umin_val; - dst_reg->umax_value *= umax_val; - if (dst_reg->umax_value > S64_MAX) { + if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) || + check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) || + check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) || + check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) { /* Overflow possible, we know nothing */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; + *dst_smin = S64_MIN; + *dst_smax = S64_MAX; } else { - dst_reg->smin_value = dst_reg->umin_value; - dst_reg->smax_value = dst_reg->umax_value; + *dst_smin = min_array(tmp_prod, 4); + *dst_smax = max_array(tmp_prod, 4); } } @@ -14462,12 +14853,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, state, true); + print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, state, true); + print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -15365,7 +15756,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, * No one could have freed the reference state before * doing the NULL check. 
*/ - WARN_ON_ONCE(release_reference_state(state, id)); + WARN_ON_ONCE(release_reference_nomark(vstate, id)); bpf_for_each_reg_in_vstate(vstate, state, reg, ({ mark_ptr_or_null_reg(state, reg, id, is_null); @@ -15596,9 +15987,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (insn->code != (BPF_JMP | BPF_JCOND) || insn->src_reg != BPF_MAY_GOTO || - insn->dst_reg || insn->imm || insn->off == 0) { - verbose(env, "invalid may_goto off %d imm %d\n", - insn->off, insn->imm); + insn->dst_reg || insn->imm) { + verbose(env, "invalid may_goto imm %d\n", insn->imm); return -EINVAL; } prev_st = find_prev_entry(env, cur_st->parent, idx); @@ -15675,7 +16065,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, *insn_idx)) return -EFAULT; if (env->log.level & BPF_LOG_LEVEL) - print_insn_state(env, this_branch->frame[this_branch->curframe]); + print_insn_state(env, this_branch, this_branch->curframe); *insn_idx += insn->off; return 0; } else if (pred == 0) { @@ -15689,7 +16079,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, *insn_idx)) return -EFAULT; if (env->log.level & BPF_LOG_LEVEL) - print_insn_state(env, this_branch->frame[this_branch->curframe]); + print_insn_state(env, this_branch, this_branch->curframe); return 0; } @@ -15806,7 +16196,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level & BPF_LOG_LEVEL) - print_insn_state(env, this_branch->frame[this_branch->curframe]); + print_insn_state(env, this_branch, this_branch->curframe); return 0; } @@ -17734,6 +18124,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) return false; break; + case STACK_IRQ_FLAG: + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + return false; + break; case STACK_MISC: case STACK_ZERO: case STACK_INVALID: @@ -17746,7 +18142,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return true; } -static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, +static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur, struct bpf_idmap *idmap) { int i; @@ -17754,12 +18150,25 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, if (old->acquired_refs != cur->acquired_refs) return false; + if (old->active_locks != cur->active_locks) + return false; + + if (old->active_preempt_locks != cur->active_preempt_locks) + return false; + + if (old->active_rcu_lock != cur->active_rcu_lock) + return false; + + if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap)) + return false; + for (i = 0; i < old->acquired_refs; i++) { if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) || old->refs[i].type != cur->refs[i].type) return false; switch (old->refs[i].type) { case REF_TYPE_PTR: + case REF_TYPE_IRQ: break; case REF_TYPE_LOCK: if (old->refs[i].ptr != cur->refs[i].ptr) @@ -17816,9 +18225,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat if (!stacksafe(env, old, cur, &env->idmap_scratch, exact)) return false; - if (!refsafe(old, cur, &env->idmap_scratch)) - return false; - return true; } @@ -17846,13 +18252,10 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->speculative && !cur->speculative) return false; - if (old->active_rcu_lock != cur->active_rcu_lock) - return false; - - if (old->active_preempt_lock 
!= cur->active_preempt_lock) + if (old->in_sleepable != cur->in_sleepable) return false; - if (old->in_sleepable != cur->in_sleepable) + if (!refsafe(old, cur, &env->idmap_scratch)) return false; /* for states to be equal callsites have to be the same @@ -18245,9 +18648,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); verbose(env, "cur state:"); - print_verifier_state(env, cur->frame[cur->curframe], true); + print_verifier_state(env, cur, cur->curframe, true); verbose(env, "old state:"); - print_verifier_state(env, sl->state.frame[cur->curframe], true); + print_verifier_state(env, &sl->state, cur->curframe, true); return -EINVAL; } /* if the verifier is processing a loop, avoid adding new state @@ -18603,7 +19006,7 @@ static int do_check(struct bpf_verifier_env *env) env->prev_insn_idx, env->insn_idx, env->cur_state->speculative ? " (speculative execution)" : ""); - print_verifier_state(env, state->frame[state->curframe], true); + print_verifier_state(env, state, state->curframe, true); do_print_state = false; } @@ -18615,7 +19018,7 @@ static int do_check(struct bpf_verifier_env *env) }; if (verifier_state_scratched(env)) - print_insn_state(env, state->frame[state->curframe]); + print_insn_state(env, state, state->curframe); verbose_linfo(env, env->insn_idx, "; "); env->prev_log_pos = env->log.end_pos; @@ -18747,10 +19150,10 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (cur_func(env)->active_locks) { + if (env->cur_state->active_locks) { if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && - (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) { + (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { verbose(env, "function calls are not allowed while holding a lock\n"); return -EINVAL; } @@ -18803,7 +19206,7 @@ process_bpf_exit_full: * match caller reference state when it exits. */ err = check_resource_leak(env, exception_exit, !env->cur_state->curframe, - "BPF_EXIT instruction"); + "BPF_EXIT instruction in main prog"); if (err) return err; @@ -18910,50 +19313,68 @@ static int find_btf_percpu_datasec(struct btf *btf) return -ENOENT; } +/* + * Add btf to the used_btfs array and return the index. (If the btf was + * already added, then just return the index.) Upon successful insertion + * increase btf refcnt, and, if present, also refcount the corresponding + * kernel module. 
+ */ +static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf) +{ + struct btf_mod_pair *btf_mod; + int i; + + /* check whether we recorded this BTF (and maybe module) already */ + for (i = 0; i < env->used_btf_cnt; i++) + if (env->used_btfs[i].btf == btf) + return i; + + if (env->used_btf_cnt >= MAX_USED_BTFS) + return -E2BIG; + + btf_get(btf); + + btf_mod = &env->used_btfs[env->used_btf_cnt]; + btf_mod->btf = btf; + btf_mod->module = NULL; + + /* if we reference variables from kernel module, bump its refcount */ + if (btf_is_module(btf)) { + btf_mod->module = btf_try_get_module(btf); + if (!btf_mod->module) { + btf_put(btf); + return -ENXIO; + } + } + + return env->used_btf_cnt++; +} + /* replace pseudo btf_id with kernel symbol address */ -static int check_pseudo_btf_id(struct bpf_verifier_env *env, - struct bpf_insn *insn, - struct bpf_insn_aux_data *aux) +static int __check_pseudo_btf_id(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_insn_aux_data *aux, + struct btf *btf) { const struct btf_var_secinfo *vsi; const struct btf_type *datasec; - struct btf_mod_pair *btf_mod; const struct btf_type *t; const char *sym_name; bool percpu = false; u32 type, id = insn->imm; - struct btf *btf; s32 datasec_id; u64 addr; - int i, btf_fd, err; - - btf_fd = insn[1].imm; - if (btf_fd) { - btf = btf_get_by_fd(btf_fd); - if (IS_ERR(btf)) { - verbose(env, "invalid module BTF object FD specified.\n"); - return -EINVAL; - } - } else { - if (!btf_vmlinux) { - verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); - return -EINVAL; - } - btf = btf_vmlinux; - btf_get(btf); - } + int i; t = btf_type_by_id(btf, id); if (!t) { verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); - err = -ENOENT; - goto err_put; + return -ENOENT; } if (!btf_type_is_var(t) && !btf_type_is_func(t)) { verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id); - err = -EINVAL; - goto err_put; + return -EINVAL; } sym_name = btf_name_by_offset(btf, t->name_off); @@ -18961,8 +19382,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, if (!addr) { verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", sym_name); - err = -ENOENT; - goto err_put; + return -ENOENT; } insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; @@ -18970,7 +19390,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, if (btf_type_is_func(t)) { aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = 0; - goto check_btf; + return 0; } datasec_id = find_btf_percpu_datasec(btf); @@ -19001,8 +19421,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, tname = btf_name_by_offset(btf, t->name_off); verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); - err = -EINVAL; - goto err_put; + return -EINVAL; } aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = tsize; @@ -19011,39 +19430,43 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } -check_btf: - /* check whether we recorded this BTF (and maybe module) already */ - for (i = 0; i < env->used_btf_cnt; i++) { - if (env->used_btfs[i].btf == btf) { - btf_put(btf); - return 0; - } - } - if (env->used_btf_cnt >= MAX_USED_BTFS) { - err = -E2BIG; - goto err_put; - } + return 0; +} - btf_mod = &env->used_btfs[env->used_btf_cnt]; - btf_mod->btf = btf; - btf_mod->module = NULL; +static int check_pseudo_btf_id(struct 
bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_insn_aux_data *aux) +{ + struct btf *btf; + int btf_fd; + int err; - /* if we reference variables from kernel module, bump its refcount */ - if (btf_is_module(btf)) { - btf_mod->module = btf_try_get_module(btf); - if (!btf_mod->module) { - err = -ENXIO; - goto err_put; + btf_fd = insn[1].imm; + if (btf_fd) { + CLASS(fd, f)(btf_fd); + + btf = __btf_get_by_fd(f); + if (IS_ERR(btf)) { + verbose(env, "invalid module BTF object FD specified.\n"); + return -EINVAL; + } + } else { + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; } + btf = btf_vmlinux; } - env->used_btf_cnt++; + err = __check_pseudo_btf_id(env, insn, aux, btf); + if (err) + return err; + err = __add_used_btf(env, btf); + if (err < 0) + return err; return 0; -err_put: - btf_put(btf); - return err; } static bool is_tracing_prog_type(enum bpf_prog_type type) @@ -19060,6 +19483,12 @@ static bool is_tracing_prog_type(enum bpf_prog_type type) } } +static bool bpf_map_is_cgroup_storage(struct bpf_map *map) +{ + return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); +} + static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) @@ -19138,39 +19567,47 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } - return 0; -} + if (bpf_map_is_cgroup_storage(map) && + bpf_cgroup_storage_assign(env->prog->aux, map)) { + verbose(env, "only one cgroup storage of each type is allowed\n"); + return -EBUSY; + } -static bool bpf_map_is_cgroup_storage(struct bpf_map *map) -{ - return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || - map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); + if (map->map_type == BPF_MAP_TYPE_ARENA) { + if (env->prog->aux->arena) { + verbose(env, "Only one arena per program\n"); + return -EBUSY; + } + if (!env->allow_ptr_leaks || !env->bpf_capable) { + verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n"); + return -EPERM; + } + if (!env->prog->jit_requested) { + verbose(env, "JIT is required to use arena\n"); + return -EOPNOTSUPP; + } + if (!bpf_jit_supports_arena()) { + verbose(env, "JIT doesn't support arena\n"); + return -EOPNOTSUPP; + } + env->prog->aux->arena = (void *)map; + if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { + verbose(env, "arena's user address must be set via map_extra or mmap()\n"); + return -EINVAL; + } + } + + return 0; } -/* Add map behind fd to used maps list, if it's not already there, and return - * its index. Also set *reused to true if this map was already in the list of - * used maps. - * Returns <0 on error, or >= 0 index, on success. 
- */ -static int add_used_map_from_fd(struct bpf_verifier_env *env, int fd, bool *reused) +static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map) { - CLASS(fd, f)(fd); - struct bpf_map *map; - int i; - - map = __bpf_map_get(f); - if (IS_ERR(map)) { - verbose(env, "fd %d is not pointing to valid bpf_map\n", fd); - return PTR_ERR(map); - } + int i, err; /* check whether we recorded this map already */ - for (i = 0; i < env->used_map_cnt; i++) { - if (env->used_maps[i] == map) { - *reused = true; + for (i = 0; i < env->used_map_cnt; i++) + if (env->used_maps[i] == map) return i; - } - } if (env->used_map_cnt >= MAX_USED_MAPS) { verbose(env, "The total number of maps per program has reached the limit of %u\n", @@ -19178,6 +19615,10 @@ static int add_used_map_from_fd(struct bpf_verifier_env *env, int fd, bool *reus return -E2BIG; } + err = check_map_prog_compatibility(env, map, env->prog); + if (err) + return err; + if (env->prog->sleepable) atomic64_inc(&map->sleepable_refcnt); @@ -19188,12 +19629,29 @@ static int add_used_map_from_fd(struct bpf_verifier_env *env, int fd, bool *reus */ bpf_map_inc(map); - *reused = false; env->used_maps[env->used_map_cnt++] = map; return env->used_map_cnt - 1; } +/* Add map behind fd to used maps list, if it's not already there, and return + * its index. + * Returns <0 on error, or >= 0 index, on success. + */ +static int add_used_map(struct bpf_verifier_env *env, int fd) +{ + struct bpf_map *map; + CLASS(fd, f)(fd); + + map = __bpf_map_get(f); + if (IS_ERR(map)) { + verbose(env, "fd %d is not pointing to valid bpf_map\n", fd); + return PTR_ERR(map); + } + + return __add_used_map(env, map); +} + /* find and rewrite pseudo imm in ld_imm64 instructions: * * 1. if it accesses map FD, replace it with actual map pointer. 
@@ -19225,7 +19683,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) int map_idx; u64 addr; u32 fd; - bool reused; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -19286,7 +19743,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) break; } - map_idx = add_used_map_from_fd(env, fd, &reused); + map_idx = add_used_map(env, fd); if (map_idx < 0) return map_idx; map = env->used_maps[map_idx]; @@ -19294,10 +19751,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) aux = &env->insn_aux_data[i]; aux->map_index = map_idx; - err = check_map_prog_compatibility(env, map, env->prog); - if (err) - return err; - if (insn[0].src_reg == BPF_PSEUDO_MAP_FD || insn[0].src_reg == BPF_PSEUDO_MAP_IDX) { addr = (unsigned long)map; @@ -19328,39 +19781,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; - /* proceed with extra checks only if its newly added used map */ - if (reused) - goto next_insn; - - if (bpf_map_is_cgroup_storage(map) && - bpf_cgroup_storage_assign(env->prog->aux, map)) { - verbose(env, "only one cgroup storage of each type is allowed\n"); - return -EBUSY; - } - if (map->map_type == BPF_MAP_TYPE_ARENA) { - if (env->prog->aux->arena) { - verbose(env, "Only one arena per program\n"); - return -EBUSY; - } - if (!env->allow_ptr_leaks || !env->bpf_capable) { - verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n"); - return -EPERM; - } - if (!env->prog->jit_requested) { - verbose(env, "JIT is required to use arena\n"); - return -EOPNOTSUPP; - } - if (!bpf_jit_supports_arena()) { - verbose(env, "JIT doesn't support arena\n"); - return -EOPNOTSUPP; - } - env->prog->aux->arena = (void *)map; - if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) { - verbose(env, "arena's user address must be set via map_extra or mmap()\n"); - return -EINVAL; - } - } - next_insn: insn++; i++; @@ -19779,23 +20199,28 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env) } static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0); +static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0); static int opt_remove_nops(struct bpf_verifier_env *env) { - const struct bpf_insn ja = NOP; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; + bool is_may_goto_0, is_ja; int i, err; for (i = 0; i < insn_cnt; i++) { - if (memcmp(&insn[i], &ja, sizeof(ja))) + is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0)); + is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP)); + + if (!is_may_goto_0 && !is_ja) continue; err = verifier_remove_insns(env, i, 1); if (err) return err; insn_cnt--; - i--; + /* Go back one insn to catch may_goto +1; may_goto +0 sequence */ + i -= (is_may_goto_0 && i > 0) ? 
2 : 1; } return 0; @@ -21277,12 +21702,12 @@ patch_map_ops_generic: if (insn->imm == BPF_FUNC_get_smp_processor_id && verifier_inlines_helper_call(env, insn->imm)) { /* BPF_FUNC_get_smp_processor_id inlining is an - * optimization, so if pcpu_hot.cpu_number is ever + * optimization, so if cpu_number is ever * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ #ifdef CONFIG_SMP - insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; @@ -22544,6 +22969,73 @@ struct btf *bpf_get_btf_vmlinux(void) return btf_vmlinux; } +/* + * The add_fd_from_fd_array() is executed only if fd_array_cnt is non-zero. In + * this case expect that every file descriptor in the array is either a map or + * a BTF. Everything else is considered to be trash. + */ +static int add_fd_from_fd_array(struct bpf_verifier_env *env, int fd) +{ + struct bpf_map *map; + struct btf *btf; + CLASS(fd, f)(fd); + int err; + + map = __bpf_map_get(f); + if (!IS_ERR(map)) { + err = __add_used_map(env, map); + if (err < 0) + return err; + return 0; + } + + btf = __btf_get_by_fd(f); + if (!IS_ERR(btf)) { + err = __add_used_btf(env, btf); + if (err < 0) + return err; + return 0; + } + + verbose(env, "fd %d is not pointing to valid bpf_map or btf\n", fd); + return PTR_ERR(map); +} + +static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, bpfptr_t uattr) +{ + size_t size = sizeof(int); + int ret; + int fd; + u32 i; + + env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel); + + /* + * The only difference between old (no fd_array_cnt is given) and new + * APIs is that in the latter case the fd_array is expected to be + * continuous and is scanned for map fds right away + */ + if (!attr->fd_array_cnt) + return 0; + + /* Check for integer overflow */ + if (attr->fd_array_cnt >= (U32_MAX / size)) { + verbose(env, "fd_array_cnt is too big (%u)\n", attr->fd_array_cnt); + return -EINVAL; + } + + for (i = 0; i < attr->fd_array_cnt; i++) { + if (copy_from_bpfptr_offset(&fd, env->fd_array, i * size, size)) + return -EFAULT; + + ret = add_fd_from_fd_array(env, fd); + if (ret) + return ret; + } + + return 0; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -22575,7 +23067,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel); env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token); env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token); @@ -22598,6 +23089,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto err_unlock; + ret = process_fd_array(env, attr, uattr); + if (ret) + goto skip_full_check; + mark_verifier_state_clean(env); if (IS_ERR(btf_vmlinux)) { diff --git a/kernel/capability.c b/kernel/capability.c index dac4df77e376..e089d2628c29 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -38,10 +38,8 @@ __setup("no_file_caps", file_caps_disable); static void warn_legacy_capability_use(void) { - char name[sizeof(current->comm)]; - pr_info_once("warning: `%s' uses 32-bit capabilities 
(legacy support in use)\n", - get_task_comm(name, current)); + current->comm); } /* @@ -62,10 +60,8 @@ static void warn_legacy_capability_use(void) static void warn_deprecated_v2(void) { - char name[sizeof(current->comm)]; - pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", - get_task_comm(name, current)); + current->comm); } /* diff --git a/kernel/cfi.c b/kernel/cfi.c index 08caad776717..19be79639542 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -7,6 +7,8 @@ #include <linux/cfi.h> +bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE); + enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type) { @@ -17,7 +19,7 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, pr_err("CFI failure at %pS (no target information)\n", (void *)addr); - if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) { + if (cfi_warn) { __warn(NULL, 0, (void *)addr, 0, regs, NULL); return BUG_TRAP_TYPE_WARN; } diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index a5c9359d516f..ede31601a363 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -7,4 +7,5 @@ obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o obj-$(CONFIG_CGROUP_MISC) += misc.o +obj-$(CONFIG_CGROUP_DMEM) += dmem.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c964dd7ff967..95ab39e1ec8f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -168,6 +168,7 @@ struct cgroup_mgctx { extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; +extern bool cgrp_dfl_visible; /* iterate across the hierarchies */ #define for_each_root(root) \ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e28d5f0d20ed..11ea8d24ac72 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -673,6 +673,7 @@ struct cftype cgroup1_base_files[] = { int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; + bool cgrp_v1_visible = false; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); @@ -684,12 +685,18 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) { if (cgroup1_subsys_absent(ss)) continue; + cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); } + if (cgrp_dfl_visible && !cgrp_v1_visible) + pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n"); + + return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9061bd55436..f231fe3a0744 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. 
*/ -static bool cgrp_dfl_visible; +bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; @@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - set_bit(CGRP_KILL, &cgrp->flags); + cgrp->kill_seq++; spin_unlock_irq(&css_set_lock); css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); @@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp) send_sig(SIGKILL, task, 0); } css_task_iter_end(&it); - - spin_lock_irq(&css_set_lock); - clear_bit(CGRP_KILL, &cgrp->flags); - spin_unlock_irq(&css_set_lock); } static void cgroup_kill(struct cgroup *cgrp) @@ -4451,7 +4447,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; @@ -5835,7 +5831,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) } /* - * This extra ref will be put in cgroup_free_fn() and guarantees + * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); @@ -6488,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); + if (kargs->cgrp) + kargs->kill_seq = kargs->cgrp->kill_seq; + else + kargs->kill_seq = cset->dfl_cgrp->kill_seq; spin_unlock_irq(&css_set_lock); if (!(kargs->flags & CLONE_INTO_CGROUP)) { @@ -6668,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned int cgrp_kill_seq = 0; unsigned long cgrp_flags = 0; bool kill = false; struct cgroup_subsys *ss; @@ -6681,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { - if (kargs->cgrp) + if (kargs->cgrp) { cgrp_flags = kargs->cgrp->flags; - else + cgrp_kill_seq = kargs->cgrp->kill_seq; + } else { cgrp_flags = cset->dfl_cgrp->flags; + cgrp_kill_seq = cset->dfl_cgrp->kill_seq; + } WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; @@ -6719,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child, * child down right after we finished preparing it for * userspace. */ - kill = test_bit(CGRP_KILL, &cgrp_flags); + kill = kargs->kill_seq != cgrp_kill_seq; } spin_unlock_irq(&css_set_lock); diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 25c1d7b77e2f..b69a7db67090 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include "cgroup-internal.h" #include "cpuset-internal.h" /* @@ -175,6 +176,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = update_relax_domain_level(cs, val); break; default: @@ -373,6 +375,46 @@ out: return ret; } +#ifdef CONFIG_PROC_PID_CPUSET +/* + * proc_cpuset_show() + * - Print tasks cpuset path into seq_file. + * - Used for /proc/<pid>/cpuset. 
+ */ +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + char *buf; + struct cgroup_subsys_state *css; + int retval; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + + if (retval == -E2BIG) + retval = -ENAMETOOLONG; + if (retval < 0) + goto out_free; + seq_puts(m, buf); + seq_putc(m, '\n'); + retval = 0; +out_free: + kfree(buf); +out: + return retval; +} +#endif /* CONFIG_PROC_PID_CPUSET */ + static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct cpuset *cs = css_cs(css); @@ -424,24 +466,31 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_MEM_HARDWALL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: + pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name); retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: + pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name); cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); break; case FILE_SPREAD_SLAB: + pr_warn_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); break; default: diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f910c828973..39c1fc643d77 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,7 +21,6 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ -#include "cgroup-internal.h" #include "cpuset-internal.h" #include <linux/init.h> @@ -954,10 +953,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs) css_task_iter_end(&it); } -static void dl_rebuild_rd_accounting(void) +void dl_rebuild_rd_accounting(void) { struct cpuset *cs = NULL; struct cgroup_subsys_state *pos_css; + int cpu; + u64 cookie = ++dl_cookie; lockdep_assert_held(&cpuset_mutex); lockdep_assert_cpus_held(); @@ -965,11 +966,12 @@ static void dl_rebuild_rd_accounting(void) rcu_read_lock(); - /* - * Clear default root domain DL accounting, it will be computed again - * if a task belongs to it. 
- */ - dl_clear_root_domain(&def_root_domain); + for_each_possible_cpu(cpu) { + if (dl_bw_visited(cpu, cookie)) + continue; + + dl_clear_root_domain_cpu(cpu); + } cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { @@ -990,16 +992,6 @@ static void dl_rebuild_rd_accounting(void) rcu_read_unlock(); } -static void -partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ - mutex_lock(&sched_domains_mutex); - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - dl_rebuild_rd_accounting(); - mutex_unlock(&sched_domains_mutex); -} - /* * Rebuild scheduler domains. * @@ -1061,7 +1053,7 @@ void rebuild_sched_domains_locked(void) ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); + partition_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ void rebuild_sched_domains_locked(void) @@ -1083,6 +1075,13 @@ void rebuild_sched_domains(void) cpus_read_unlock(); } +void cpuset_reset_sched_domains(void) +{ + mutex_lock(&cpuset_mutex); + partition_sched_domains(1, NULL, NULL); + mutex_unlock(&cpuset_mutex); +} + /** * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -4244,50 +4243,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc/<pid>/cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it - * anyway. - */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - rcu_read_lock(); - spin_lock_irq(&css_set_lock); - css = task_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - spin_unlock_irq(&css_set_lock); - rcu_read_unlock(); - - if (retval == -E2BIG) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c new file mode 100644 index 000000000000..10b63433f057 --- /dev/null +++ b/kernel/cgroup/dmem.c @@ -0,0 +1,829 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>) + * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>) + * Partially based on the rdma and misc controllers, which bear the following copyrights: + * + * Copyright 2020 Google LLC + * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> + */ + +#include <linux/cgroup.h> +#include <linux/cgroup_dmem.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/page_counter.h> +#include <linux/parser.h> +#include <linux/slab.h> + +struct dmem_cgroup_region { + /** + * @ref: References keeping the region alive. 
+ * Keeps the region reference alive after a succesful RCU lookup. + */ + struct kref ref; + + /** @rcu: RCU head for freeing */ + struct rcu_head rcu; + + /** + * @region_node: Linked into &dmem_cgroup_regions list. + * Protected by RCU and global spinlock. + */ + struct list_head region_node; + + /** + * @pools: List of pools linked to this region. + * Protected by global spinlock only + */ + struct list_head pools; + + /** @size: Size of region, in bytes */ + u64 size; + + /** @name: Name describing the node, set by dmem_cgroup_register_region */ + char *name; + + /** + * @unregistered: Whether the region is unregistered by its caller. + * No new pools should be added to the region afterwards. + */ + bool unregistered; +}; + +struct dmemcg_state { + struct cgroup_subsys_state css; + + struct list_head pools; +}; + +struct dmem_cgroup_pool_state { + struct dmem_cgroup_region *region; + struct dmemcg_state *cs; + + /* css node, RCU protected against region teardown */ + struct list_head css_node; + + /* dev node, no RCU protection required */ + struct list_head region_node; + + struct rcu_head rcu; + + struct page_counter cnt; + + bool inited; +}; + +/* + * 3 operations require locking protection: + * - Registering and unregistering region to/from list, requires global lock. + * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed. + * - Adding a dmem_cgroup_pool_state to a region list. + * + * Since for the most common operations RCU provides enough protection, I + * do not think more granular locking makes sense. Most protection is offered + * by RCU and the lockless operating page_counter. + */ +static DEFINE_SPINLOCK(dmemcg_lock); +static LIST_HEAD(dmem_cgroup_regions); + +static inline struct dmemcg_state * +css_to_dmemcs(struct cgroup_subsys_state *css) +{ + return container_of(css, struct dmemcg_state, css); +} + +static inline struct dmemcg_state *get_current_dmemcs(void) +{ + return css_to_dmemcs(task_get_css(current, dmem_cgrp_id)); +} + +static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg) +{ + return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL; +} + +static void free_cg_pool(struct dmem_cgroup_pool_state *pool) +{ + list_del(&pool->region_node); + kfree(pool); +} + +static void +set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val) +{ + page_counter_set_min(&pool->cnt, val); +} + +static void +set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val) +{ + page_counter_set_low(&pool->cnt, val); +} + +static void +set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val) +{ + page_counter_set_max(&pool->cnt, val); +} + +static u64 get_resource_low(struct dmem_cgroup_pool_state *pool) +{ + return pool ? READ_ONCE(pool->cnt.low) : 0; +} + +static u64 get_resource_min(struct dmem_cgroup_pool_state *pool) +{ + return pool ? READ_ONCE(pool->cnt.min) : 0; +} + +static u64 get_resource_max(struct dmem_cgroup_pool_state *pool) +{ + return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX; +} + +static u64 get_resource_current(struct dmem_cgroup_pool_state *pool) +{ + return pool ? 
page_counter_read(&pool->cnt) : 0; +} + +static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool) +{ + set_resource_min(rpool, 0); + set_resource_low(rpool, 0); + set_resource_max(rpool, PAGE_COUNTER_MAX); +} + +static void dmemcs_offline(struct cgroup_subsys_state *css) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(css); + struct dmem_cgroup_pool_state *pool; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &dmemcs->pools, css_node) + reset_all_resource_limits(pool); + rcu_read_unlock(); +} + +static void dmemcs_free(struct cgroup_subsys_state *css) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(css); + struct dmem_cgroup_pool_state *pool, *next; + + spin_lock(&dmemcg_lock); + list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) { + /* + *The pool is dead and all references are 0, + * no need for RCU protection with list_del_rcu or freeing. + */ + list_del(&pool->css_node); + free_cg_pool(pool); + } + spin_unlock(&dmemcg_lock); + + kfree(dmemcs); +} + +static struct cgroup_subsys_state * +dmemcs_alloc(struct cgroup_subsys_state *parent_css) +{ + struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL); + if (!dmemcs) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dmemcs->pools); + return &dmemcs->css; +} + +static struct dmem_cgroup_pool_state * +find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region) +{ + struct dmem_cgroup_pool_state *pool; + + list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock)) + if (pool->region == region) + return pool; + + return NULL; +} + +static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool) +{ + if (!pool->cnt.parent) + return NULL; + + return container_of(pool->cnt.parent, typeof(*pool), cnt); +} + +static void +dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool) +{ + struct page_counter *climit; + struct cgroup_subsys_state *css; + struct dmemcg_state *dmemcg_iter; + struct dmem_cgroup_pool_state *pool, *found_pool; + + climit = &limit_pool->cnt; + + rcu_read_lock(); + + css_for_each_descendant_pre(css, &limit_pool->cs->css) { + dmemcg_iter = container_of(css, struct dmemcg_state, css); + found_pool = NULL; + + list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) { + if (pool->region == limit_pool->region) { + found_pool = pool; + break; + } + } + if (!found_pool) + continue; + + page_counter_calculate_protection( + climit, &found_pool->cnt, true); + + if (found_pool == test_pool) + break; + } + rcu_read_unlock(); +} + +/** + * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool + * @limit_pool: The pool for which we hit limits + * @test_pool: The pool for which to test + * @ignore_low: Whether we have to respect low watermarks. + * @ret_hit_low: Pointer to whether it makes sense to consider low watermark. + * + * This function returns true if we can evict from @test_pool, false if not. + * When returning false and @ignore_low is false, @ret_hit_low may + * be set to true to indicate this function can be retried with @ignore_low + * set to true. 
+ * + * Return: bool + */ +bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool, + bool ignore_low, bool *ret_hit_low) +{ + struct dmem_cgroup_pool_state *pool = test_pool; + struct page_counter *ctest; + u64 used, min, low; + + /* Can always evict from current pool, despite limits */ + if (limit_pool == test_pool) + return true; + + if (limit_pool) { + if (!parent_dmemcs(limit_pool->cs)) + return true; + + for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool)) + {} + + if (!pool) + return false; + } else { + /* + * If there is no cgroup limiting memory usage, use the root + * cgroup instead for limit calculations. + */ + for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool)) + {} + } + + ctest = &test_pool->cnt; + + dmem_cgroup_calculate_protection(limit_pool, test_pool); + + used = page_counter_read(ctest); + min = READ_ONCE(ctest->emin); + + if (used <= min) + return false; + + if (!ignore_low) { + low = READ_ONCE(ctest->elow); + if (used > low) + return true; + + *ret_hit_low = true; + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable); + +static struct dmem_cgroup_pool_state * +alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region, + struct dmem_cgroup_pool_state **allocpool) +{ + struct dmemcg_state *parent = parent_dmemcs(dmemcs); + struct dmem_cgroup_pool_state *pool, *ppool = NULL; + + if (!*allocpool) { + pool = kzalloc(sizeof(*pool), GFP_NOWAIT); + if (!pool) + return ERR_PTR(-ENOMEM); + } else { + pool = *allocpool; + *allocpool = NULL; + } + + pool->region = region; + pool->cs = dmemcs; + + if (parent) + ppool = find_cg_pool_locked(parent, region); + + page_counter_init(&pool->cnt, + ppool ? &ppool->cnt : NULL, true); + reset_all_resource_limits(pool); + + list_add_tail_rcu(&pool->css_node, &dmemcs->pools); + list_add_tail(&pool->region_node, &region->pools); + + if (!parent) + pool->inited = true; + else + pool->inited = ppool ? ppool->inited : false; + return pool; +} + +static struct dmem_cgroup_pool_state * +get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region, + struct dmem_cgroup_pool_state **allocpool) +{ + struct dmem_cgroup_pool_state *pool, *ppool, *retpool; + struct dmemcg_state *p, *pp; + + /* + * Recursively create pool, we may not initialize yet on + * recursion, this is done as a separate step. + */ + for (p = dmemcs; p; p = parent_dmemcs(p)) { + pool = find_cg_pool_locked(p, region); + if (!pool) + pool = alloc_pool_single(p, region, allocpool); + + if (IS_ERR(pool)) + return pool; + + if (p == dmemcs && pool->inited) + return pool; + + if (pool->inited) + break; + } + + retpool = pool = find_cg_pool_locked(dmemcs, region); + for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) { + if (pool->inited) + break; + + /* ppool was created if it didn't exist by above loop. */ + ppool = find_cg_pool_locked(pp, region); + + /* Fix up parent links, mark as inited. 
*/ + pool->cnt.parent = &ppool->cnt; + pool->inited = true; + + pool = ppool; + } + + return retpool; +} + +static void dmemcg_free_rcu(struct rcu_head *rcu) +{ + struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu); + struct dmem_cgroup_pool_state *pool, *next; + + list_for_each_entry_safe(pool, next, &region->pools, region_node) + free_cg_pool(pool); + kfree(region->name); + kfree(region); +} + +static void dmemcg_free_region(struct kref *ref) +{ + struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref); + + call_rcu(&cgregion->rcu, dmemcg_free_rcu); +} + +/** + * dmem_cgroup_unregister_region() - Unregister a previously registered region. + * @region: The region to unregister. + * + * This function undoes dmem_cgroup_register_region. + */ +void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) +{ + struct list_head *entry; + + if (!region) + return; + + spin_lock(&dmemcg_lock); + + /* Remove from global region list */ + list_del_rcu(&region->region_node); + + list_for_each_rcu(entry, &region->pools) { + struct dmem_cgroup_pool_state *pool = + container_of(entry, typeof(*pool), region_node); + + list_del_rcu(&pool->css_node); + } + + /* + * Ensure any RCU based lookups fail. Additionally, + * no new pools should be added to the dead region + * by get_cg_pool_unlocked. + */ + region->unregistered = true; + spin_unlock(&dmemcg_lock); + + kref_put(&region->ref, dmemcg_free_region); +} +EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region); + +/** + * dmem_cgroup_register_region() - Register a regions for dev cgroup. + * @size: Size of region to register, in bytes. + * @fmt: Region parameters to register + * + * This function registers a node in the dmem cgroup with the + * name given. After calling this function, the region can be + * used for allocations. + * + * Return: NULL or a struct on success, PTR_ERR on failure. + */ +struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...) +{ + struct dmem_cgroup_region *ret; + char *region_name; + va_list ap; + + if (!size) + return NULL; + + va_start(ap, fmt); + region_name = kvasprintf(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!region_name) + return ERR_PTR(-ENOMEM); + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) { + kfree(region_name); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&ret->pools); + ret->name = region_name; + ret->size = size; + kref_init(&ret->ref); + + spin_lock(&dmemcg_lock); + list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions); + spin_unlock(&dmemcg_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(dmem_cgroup_register_region); + +static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name) +{ + struct dmem_cgroup_region *region; + + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock)) + if (!strcmp(name, region->name) && + kref_get_unless_zero(&region->ref)) + return region; + + return NULL; +} + +/** + * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state + * @pool: &dmem_cgroup_pool_state + * + * Called to drop a reference to the limiting pool returned by + * dmem_cgroup_try_charge(). 
+ */ +void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) +{ + if (pool) + css_put(&pool->cs->css); +} +EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put); + +static struct dmem_cgroup_pool_state * +get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) +{ + struct dmem_cgroup_pool_state *pool, *allocpool = NULL; + + /* fastpath lookup? */ + rcu_read_lock(); + pool = find_cg_pool_locked(cg, region); + if (pool && !READ_ONCE(pool->inited)) + pool = NULL; + rcu_read_unlock(); + + while (!pool) { + spin_lock(&dmemcg_lock); + if (!region->unregistered) + pool = get_cg_pool_locked(cg, region, &allocpool); + else + pool = ERR_PTR(-ENODEV); + spin_unlock(&dmemcg_lock); + + if (pool == ERR_PTR(-ENOMEM)) { + pool = NULL; + if (WARN_ON(allocpool)) + continue; + + allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL); + if (allocpool) { + pool = NULL; + continue; + } + } + } + + kfree(allocpool); + return pool; +} + +/** + * dmem_cgroup_uncharge() - Uncharge a pool. + * @pool: Pool to uncharge. + * @size: Size to uncharge. + * + * Undoes the effects of dmem_cgroup_try_charge. + * Must be called with the returned pool as argument, + * and same @index and @size. + */ +void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) +{ + if (!pool) + return; + + page_counter_uncharge(&pool->cnt, size); + css_put(&pool->cs->css); +} +EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge); + +/** + * dmem_cgroup_try_charge() - Try charging a new allocation to a region. + * @region: dmem region to charge + * @size: Size (in bytes) to charge. + * @ret_pool: On succesfull allocation, the pool that is charged. + * @ret_limit_pool: On a failed allocation, the limiting pool. + * + * This function charges the @region region for a size of @size bytes. + * + * If the function succeeds, @ret_pool is set, which must be passed to + * dmem_cgroup_uncharge() when undoing the allocation. + * + * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it + * will be set to the pool for which the limit is hit. This can be used for + * eviction as argument to dmem_cgroup_evict_valuable(). This reference must be freed + * with @dmem_cgroup_pool_state_put(). + * + * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure. + */ +int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool) +{ + struct dmemcg_state *cg; + struct dmem_cgroup_pool_state *pool; + struct page_counter *fail; + int ret; + + *ret_pool = NULL; + if (ret_limit_pool) + *ret_limit_pool = NULL; + + /* + * hold on to css, as cgroup can be removed but resource + * accounting happens on css. 
+ */ + cg = get_current_dmemcs(); + + pool = get_cg_pool_unlocked(cg, region); + if (IS_ERR(pool)) { + ret = PTR_ERR(pool); + goto err; + } + + if (!page_counter_try_charge(&pool->cnt, size, &fail)) { + if (ret_limit_pool) { + *ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt); + css_get(&(*ret_limit_pool)->cs->css); + } + ret = -EAGAIN; + goto err; + } + + /* On success, reference from get_current_dmemcs is transferred to *ret_pool */ + *ret_pool = pool; + return 0; + +err: + css_put(&cg->css); + return ret; +} +EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge); + +static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v) +{ + struct dmem_cgroup_region *region; + + rcu_read_lock(); + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) { + seq_puts(sf, region->name); + seq_printf(sf, " %llu\n", region->size); + } + rcu_read_unlock(); + return 0; +} + +static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region, + u64 *new_limit) +{ + char *end; + + if (!strcmp(options, "max")) { + *new_limit = PAGE_COUNTER_MAX; + return 0; + } + + *new_limit = memparse(options, &end); + if (*end != '\0') + return -EINVAL; + + return 0; +} + +static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + void (*apply)(struct dmem_cgroup_pool_state *, u64)) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of)); + int err = 0; + + while (buf && !err) { + struct dmem_cgroup_pool_state *pool = NULL; + char *options, *region_name; + struct dmem_cgroup_region *region; + u64 new_limit; + + options = buf; + buf = strchr(buf, '\n'); + if (buf) + *buf++ = '\0'; + + options = strstrip(options); + + /* eat empty lines */ + if (!options[0]) + continue; + + region_name = strsep(&options, " \t"); + if (!region_name[0]) + continue; + + rcu_read_lock(); + region = dmemcg_get_region_by_name(region_name); + rcu_read_unlock(); + + if (!region) + return -EINVAL; + + err = dmemcg_parse_limit(options, region, &new_limit); + if (err < 0) + goto out_put; + + pool = get_cg_pool_unlocked(dmemcs, region); + if (IS_ERR(pool)) { + err = PTR_ERR(pool); + goto out_put; + } + + /* And commit */ + apply(pool, new_limit); + +out_put: + kref_put(&region->ref, dmemcg_free_region); + } + + + return err ?: nbytes; +} + +static int dmemcg_limit_show(struct seq_file *sf, void *v, + u64 (*fn)(struct dmem_cgroup_pool_state *)) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf)); + struct dmem_cgroup_region *region; + + rcu_read_lock(); + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) { + struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region); + u64 val; + + seq_puts(sf, region->name); + + val = fn(pool); + if (val < PAGE_COUNTER_MAX) + seq_printf(sf, " %lld\n", val); + else + seq_puts(sf, " max\n"); + } + rcu_read_unlock(); + + return 0; +} + +static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_current); +} + +static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_min); +} + +static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min); +} + +static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_low); +} + +static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file 
*of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low); +} + +static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_max); +} + +static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max); +} + +static struct cftype files[] = { + { + .name = "capacity", + .seq_show = dmem_cgroup_region_capacity_show, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + { + .name = "current", + .seq_show = dmem_cgroup_region_current_show, + }, + { + .name = "min", + .write = dmem_cgroup_region_min_write, + .seq_show = dmem_cgroup_region_min_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "low", + .write = dmem_cgroup_region_low_write, + .seq_show = dmem_cgroup_region_low_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "max", + .write = dmem_cgroup_region_max_write, + .seq_show = dmem_cgroup_region_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* Zero entry terminates. */ +}; + +struct cgroup_subsys dmem_cgrp_subsys = { + .css_alloc = dmemcs_alloc, + .css_free = dmemcs_free, + .css_offline = dmemcs_offline, + .legacy_cftypes = files, + .dfl_cftypes = files, +}; diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 074653f964c1..039d1eb2f215 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -430,9 +430,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of, if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) { + pr_info_once("Freezing with imperfect legacy cgroup freezer. " + "See cgroup.freeze of cgroup v2\n"); freeze = true; - else + } else return -EINVAL; freezer_change_state(css_freezer(of_css(of)), freeze); diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 0e26068995a6..2fa3a4fb2aaf 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -68,22 +68,6 @@ static inline bool valid_type(enum misc_res_type type) } /** - * misc_cg_res_total_usage() - Get the current total usage of the resource. - * @type: misc res type. - * - * Context: Any context. - * Return: Current total usage of the resource. - */ -u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - if (valid_type(type)) - return atomic64_read(&root_cg.res[type].usage); - - return 0; -} -EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); - -/** * misc_cg_set_capacity() - Set the capacity of the misc cgroup res. * @type: Type of the misc res. * @capacity: Supported capacity of the misc res on the host. 
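To show how the new dmem controller added above (kernel/cgroup/dmem.c) is meant to be consumed by a device driver, here is a minimal sketch of the charging flow. It is illustrative only, not code from this series: every my_* name, the region naming format and the probe/alloc/free structure are assumptions, while the dmem_cgroup_*() calls and the <linux/cgroup_dmem.h> header come from the hunk above.

#include <linux/cgroup_dmem.h>
#include <linux/err.h>

/* Hypothetical device state; not part of the patch. */
struct my_vram_device {
	struct dmem_cgroup_region *region;
	u64 vram_size;
};

/* Register the device memory region once, e.g. at probe time. */
static int my_vram_init_cgroup(struct my_vram_device *mydev, const char *name)
{
	mydev->region = dmem_cgroup_register_region(mydev->vram_size,
						    "my_vram/%s", name);
	/* NULL (size == 0) is not an error; an ERR_PTR() is. */
	if (IS_ERR(mydev->region))
		return PTR_ERR(mydev->region);
	return 0;
}

/* Charge the current task's cgroup before handing out device memory. */
static int my_vram_alloc(struct my_vram_device *mydev, u64 size,
			 struct dmem_cgroup_pool_state **pool)
{
	struct dmem_cgroup_pool_state *limit_pool;
	int ret;

	ret = dmem_cgroup_try_charge(mydev->region, size, pool, &limit_pool);
	if (ret == -EAGAIN) {
		/*
		 * Over a limit: a real driver would evict buffers whose
		 * pools pass dmem_cgroup_state_evict_valuable() against
		 * limit_pool, then retry the charge.
		 */
		dmem_cgroup_pool_state_put(limit_pool);
	}
	return ret;
}

/* Mirror a successful charge with the same pool and size. */
static void my_vram_free(struct dmem_cgroup_pool_state *pool, u64 size)
{
	dmem_cgroup_uncharge(pool, size);
}

The cftype table at the end of the dmem.c hunk suggests the matching userspace interface: a root-only dmem.capacity file listing each registered region, plus per-cgroup dmem.current, dmem.min, dmem.low and dmem.max files keyed by region name, with the writable ones parsed by dmemcg_limit_write().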
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 5877974ece92..4bb587d5d34f 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -299,40 +299,6 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) spin_unlock_irq(&cgroup_rstat_lock); } -/* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) -{ - int cpu; - - lockdep_assert_held(&cgroup_rstat_lock); - - for_each_possible_cpu(cpu) { - struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - - for (; pos; pos = pos->rstat_flush_next) { - struct cgroup_subsys_state *css; - - cgroup_base_stat_flush(pos, cpu); - bpf_rstat_flush(pos, cgroup_parent(pos), cpu); - - rcu_read_lock(); - list_for_each_entry_rcu(css, &pos->rstat_css_list, - rstat_css_node) - css->ss->css_rstat_flush(css, cpu); - rcu_read_unlock(); - } - - /* play nice and yield if necessary */ - if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - __cgroup_rstat_unlock(cgrp, cpu); - if (!cond_resched()) - cpu_relax(); - __cgroup_rstat_lock(cgrp, cpu); - } - } -} - /** * cgroup_rstat_flush - flush stats in @cgrp's subtree * @cgrp: target cgroup @@ -348,38 +314,30 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) */ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) { + int cpu; + might_sleep(); + for_each_possible_cpu(cpu) { + struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); - __cgroup_rstat_unlock(cgrp, -1); -} + /* Reacquire for each CPU to avoid disabling IRQs too long */ + __cgroup_rstat_lock(cgrp, cpu); + for (; pos; pos = pos->rstat_flush_next) { + struct cgroup_subsys_state *css; -/** - * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold - * @cgrp: target cgroup - * - * Flush stats in @cgrp's subtree and prevent further flushes. Must be - * paired with cgroup_rstat_flush_release(). - * - * This function may block. 
- */ -void cgroup_rstat_flush_hold(struct cgroup *cgrp) - __acquires(&cgroup_rstat_lock) -{ - might_sleep(); - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); -} + cgroup_base_stat_flush(pos, cpu); + bpf_rstat_flush(pos, cgroup_parent(pos), cpu); -/** - * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() - * @cgrp: cgroup used by tracepoint - */ -void cgroup_rstat_flush_release(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) -{ - __cgroup_rstat_unlock(cgrp, -1); + rcu_read_lock(); + list_for_each_entry_rcu(css, &pos->rstat_css_list, + rstat_css_node) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } + __cgroup_rstat_unlock(cgrp, cpu); + if (!cond_resched()) + cpu_relax(); + } } int cgroup_rstat_init(struct cgroup *cgrp) @@ -590,7 +548,6 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat) cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; - cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; #ifdef CONFIG_SCHED_CORE bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE]; @@ -613,36 +570,34 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime, ntime; + struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; + cgroup_rstat_flush(cgrp); + __cgroup_rstat_lock(cgrp, -1); + bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, - &utime, &stime); - ntime = cgrp->bstat.ntime; - cgroup_rstat_flush_release(cgrp); + &bstat.cputime.utime, &bstat.cputime.stime); + __cgroup_rstat_unlock(cgrp, -1); } else { - /* cgrp->bstat of root is not actually used, reuse it */ - root_cgroup_cputime(&cgrp->bstat); - usage = cgrp->bstat.cputime.sum_exec_runtime; - utime = cgrp->bstat.cputime.utime; - stime = cgrp->bstat.cputime.stime; - ntime = cgrp->bstat.ntime; + root_cgroup_cputime(&bstat); } - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - do_div(ntime, NSEC_PER_USEC); + do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); + do_div(bstat.cputime.utime, NSEC_PER_USEC); + do_div(bstat.cputime.stime, NSEC_PER_USEC); + do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", - usage, utime, stime, ntime); + bstat.cputime.sum_exec_runtime, + bstat.cputime.utime, + bstat.cputime.stime, + bstat.ntime); - cgroup_force_idle_show(seq, &cgrp->bstat); + cgroup_force_idle_show(seq, &bstat); } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 3fabb8f55ef6..dd7c32fb5ac1 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -46,7 +46,7 @@ CONFIG_UBSAN_BOUNDS=y # CONFIG_UBSAN_SHIFT is not set # CONFIG_UBSAN_DIV_ZERO is not set # CONFIG_UBSAN_UNREACHABLE is not set -# CONFIG_UBSAN_SIGNED_WRAP is not set +# CONFIG_UBSAN_INTEGER_WRAP is not set # CONFIG_UBSAN_BOOL is not set # CONFIG_UBSAN_ENUM is not set # CONFIG_UBSAN_ALIGNMENT is not set diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 938c48952d26..fb5be6e9b423 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -80,17 +80,16 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void) */ static noinstr void 
ct_kernel_exit_state(int offset) { - int seq; - /* * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ rcu_task_trace_heavyweight_enter(); // Before CT state update! - seq = ct_state_inc(offset); - // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); + // RCU is still watching. Better not be in extended quiescent state! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu()); + (void)ct_state_inc(offset); + // RCU is no longer watching. } /* diff --git a/kernel/cpu.c b/kernel/cpu.c index 0509a9733745..b08bb34b1718 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -526,6 +526,7 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held); #ifdef CONFIG_LOCKDEP int lockdep_is_cpus_held(void) @@ -905,12 +906,13 @@ static int finish_cpu(unsigned int cpu) struct mm_struct *mm = idle->active_mm; /* - * idle_task_exit() will have switched to &init_mm, now - * clean up any remaining active_mm state. + * sched_force_init_mm() ensured the use of &init_mm, + * drop that refcount now that the CPU has stopped. */ - if (mm != &init_mm) - idle->active_mm = &init_mm; + WARN_ON(mm != &init_mm); + idle->active_mm = NULL; mmdrop_lazy_tlb(mm); + return 0; } @@ -1452,11 +1454,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, out: cpus_write_unlock(); - /* - * Do post unplug cleanup. This is still protected against - * concurrent CPU hotplug via cpu_add_remove_lock. - */ - lockup_detector_cleanup(); arch_smt_update(); return ret; } @@ -3128,11 +3125,6 @@ void init_cpu_possible(const struct cpumask *src) cpumask_copy(&__cpu_possible_mask, src); } -void init_cpu_online(const struct cpumask *src) -{ - cpumask_copy(&__cpu_online_mask, src); -} - void set_cpu_online(unsigned int cpu, bool online) { /* diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 078fe5bc5a74..335b8425dd4b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -436,7 +436,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) memset(&prstatus, 0, sizeof(prstatus)); prstatus.common.pr_pid = current->pid; elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); } diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 0a39497140bf..05b137e7dcb9 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -305,7 +305,7 @@ int kdb_putarea_size(unsigned long addr, void *res, size_t size) /* * kdb_getphys - Read data from a physical address. 
Validate the - * address is in range, use kmap_atomic() to get data + * address is in range, use kmap_local_page() to get data * similar to kdb_getarea() - but for phys addresses * Inputs: * res Pointer to the word to receive the result @@ -324,9 +324,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size) if (!pfn_valid(pfn)) return 1; page = pfn_to_page(pfn); - vaddr = kmap_atomic(page); + vaddr = kmap_local_page(page); memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); - kunmap_atomic(vaddr); + kunmap_local(vaddr); return 0; } @@ -536,21 +536,3 @@ bool kdb_task_state(const struct task_struct *p, const char *mask) return strchr(mask, state); } - -/* Maintain a small stack of kdb_flags to allow recursion without disturbing - * the global kdb state. - */ - -static int kdb_flags_stack[4], kdb_flags_index; - -void kdb_save_flags(void) -{ - BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack)); - kdb_flags_stack[kdb_flags_index++] = kdb_flags; -} - -void kdb_restore_flags(void) -{ - BUG_ON(kdb_flags_index <= 0); - kdb_flags = kdb_flags_stack[--kdb_flags_index]; -} diff --git a/kernel/delayacct.c b/kernel/delayacct.c index dead51de8eb5..eb63a021ac04 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -64,7 +64,7 @@ static int sysctl_delayacct(const struct ctl_table *table, int write, void *buff return err; } -static struct ctl_table kern_delayacct_table[] = { +static const struct ctl_table kern_delayacct_table[] = { { .procname = "task_delayacct", .data = NULL, @@ -93,9 +93,9 @@ void __delayacct_tsk_init(struct task_struct *tsk) /* * Finish delay accounting for a statistic using its timestamps (@start), - * accumalator (@total) and @count + * accumulator (@total) and @count */ -static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min) { s64 ns = local_clock() - *start; unsigned long flags; @@ -104,6 +104,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; + if (ns > *max) + *max = ns; + if (*min == 0 || ns < *min) + *min = ns; raw_spin_unlock_irqrestore(lock, flags); } } @@ -122,7 +126,9 @@ void __delayacct_blkio_end(struct task_struct *p) delayacct_end(&p->delays->lock, &p->delays->blkio_start, &p->delays->blkio_delay, - &p->delays->blkio_count); + &p->delays->blkio_count, + &p->delays->blkio_delay_max, + &p->delays->blkio_delay_min); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -153,10 +159,12 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_count += t1; + d->cpu_delay_max = tsk->sched_info.max_run_delay; + d->cpu_delay_min = tsk->sched_info.min_run_delay; tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; - tmp = (s64)d->cpu_run_virtual_total + t3; + d->cpu_run_virtual_total = (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; @@ -164,20 +172,33 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) return 0; /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ - raw_spin_lock_irqsave(&tsk->delays->lock, flags); + d->blkio_delay_max = tsk->delays->blkio_delay_max; + d->blkio_delay_min = tsk->delays->blkio_delay_min; tmp = d->blkio_delay_total + tsk->delays->blkio_delay; d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 
0 : tmp; + d->swapin_delay_max = tsk->delays->swapin_delay_max; + d->swapin_delay_min = tsk->delays->swapin_delay_min; tmp = d->swapin_delay_total + tsk->delays->swapin_delay; d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; + d->freepages_delay_max = tsk->delays->freepages_delay_max; + d->freepages_delay_min = tsk->delays->freepages_delay_min; tmp = d->freepages_delay_total + tsk->delays->freepages_delay; d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; + d->thrashing_delay_max = tsk->delays->thrashing_delay_max; + d->thrashing_delay_min = tsk->delays->thrashing_delay_min; tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; + d->compact_delay_max = tsk->delays->compact_delay_max; + d->compact_delay_min = tsk->delays->compact_delay_min; tmp = d->compact_delay_total + tsk->delays->compact_delay; d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; + d->wpcopy_delay_max = tsk->delays->wpcopy_delay_max; + d->wpcopy_delay_min = tsk->delays->wpcopy_delay_min; tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; + d->irq_delay_max = tsk->delays->irq_delay_max; + d->irq_delay_min = tsk->delays->irq_delay_min; tmp = d->irq_delay_total + tsk->delays->irq_delay; d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; @@ -213,7 +234,9 @@ void __delayacct_freepages_end(void) delayacct_end(&current->delays->lock, &current->delays->freepages_start, &current->delays->freepages_delay, - &current->delays->freepages_count); + &current->delays->freepages_count, + &current->delays->freepages_delay_max, + &current->delays->freepages_delay_min); } void __delayacct_thrashing_start(bool *in_thrashing) @@ -235,7 +258,9 @@ void __delayacct_thrashing_end(bool *in_thrashing) delayacct_end(&current->delays->lock, &current->delays->thrashing_start, &current->delays->thrashing_delay, - &current->delays->thrashing_count); + &current->delays->thrashing_count, + &current->delays->thrashing_delay_max, + &current->delays->thrashing_delay_min); } void __delayacct_swapin_start(void) @@ -248,7 +273,9 @@ void __delayacct_swapin_end(void) delayacct_end(&current->delays->lock, &current->delays->swapin_start, &current->delays->swapin_delay, - &current->delays->swapin_count); + &current->delays->swapin_count, + &current->delays->swapin_delay_max, + &current->delays->swapin_delay_min); } void __delayacct_compact_start(void) @@ -261,7 +288,9 @@ void __delayacct_compact_end(void) delayacct_end(&current->delays->lock, &current->delays->compact_start, &current->delays->compact_delay, - &current->delays->compact_count); + &current->delays->compact_count, + &current->delays->compact_delay_max, + &current->delays->compact_delay_min); } void __delayacct_wpcopy_start(void) @@ -274,7 +303,9 @@ void __delayacct_wpcopy_end(void) delayacct_end(&current->delays->lock, &current->delays->wpcopy_start, &current->delays->wpcopy_delay, - &current->delays->wpcopy_count); + &current->delays->wpcopy_count, + &current->delays->wpcopy_delay_max, + &current->delays->wpcopy_delay_min); } void __delayacct_irq(struct task_struct *task, u32 delta) @@ -284,6 +315,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta) raw_spin_lock_irqsave(&task->delays->lock, flags); task->delays->irq_delay += delta; task->delays->irq_count++; + if (delta > task->delays->irq_delay_max) + task->delays->irq_delay_max = delta; + if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min)) + task->delays->irq_delay_min = delta; raw_spin_unlock_irqrestore(&task->delays->lock, flags); } diff --git 
a/kernel/dma/direct.c b/kernel/dma/direct.c index 5b4e6d3bf7bc..b8fe0b3d0ffb 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -584,6 +584,22 @@ int dma_direct_supported(struct device *dev, u64 mask) return mask >= phys_to_dma_unencrypted(dev, min_mask); } +static const struct bus_dma_region *dma_find_range(struct device *dev, + unsigned long start_pfn) +{ + const struct bus_dma_region *m; + + for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { + unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); + + if (start_pfn >= cpu_start_pfn && + start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) + return m; + } + + return NULL; +} + /* * To check whether all ram resource ranges are covered by dma range map * Returns 0 when further check is needed @@ -593,20 +609,12 @@ static int check_ram_in_range_map(unsigned long start_pfn, unsigned long nr_pages, void *data) { unsigned long end_pfn = start_pfn + nr_pages; - const struct bus_dma_region *bdr = NULL; - const struct bus_dma_region *m; struct device *dev = data; while (start_pfn < end_pfn) { - for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { - unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); + const struct bus_dma_region *bdr; - if (start_pfn >= cpu_start_pfn && - start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) { - bdr = m; - break; - } - } + bdr = dma_find_range(dev, start_pfn); if (!bdr) return 1; diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 095c775e001e..d4b8bd0af79b 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -6,6 +6,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +# Branch profiling isn't noinstr-safe +ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING + CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e33691d5adf7..20154572ede9 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -49,7 +49,7 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Do seccomp after ptrace, to catch any tracer changes. */ if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(NULL); + ret = __secure_computing(); if (ret == -1L) return ret; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 8a47e52a454f..6c83ad674d01 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -22,6 +22,7 @@ struct callchain_cpus_entries { int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; +static const int six_hundred_forty_kb = 640 * 1024; static inline size_t perf_callchain_entry__sizeof(void) { @@ -266,12 +267,8 @@ exit_put: return entry; } -/* - * Used for sysctl_perf_event_max_stack and - * sysctl_perf_event_max_contexts_per_stack. 
- */ -int perf_event_max_stack_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int perf_event_max_stack_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; @@ -292,3 +289,32 @@ int perf_event_max_stack_handler(const struct ctl_table *table, int write, return ret; } + +static const struct ctl_table callchain_sysctl_table[] = { + { + .procname = "perf_event_max_stack", + .data = &sysctl_perf_event_max_stack, + .maxlen = sizeof(sysctl_perf_event_max_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&six_hundred_forty_kb, + }, + { + .procname = "perf_event_max_contexts_per_stack", + .data = &sysctl_perf_event_max_contexts_per_stack, + .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), + .mode = 0644, + .proc_handler = perf_event_max_stack_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_THOUSAND, + }, +}; + +static int __init init_callchain_sysctls(void) +{ + register_sysctl_init("kernel", callchain_sysctl_table); + return 0; +} +core_initcall(init_callchain_sysctls); + diff --git a/kernel/events/core.c b/kernel/events/core.c index 065f9188b44a..0bb21659e252 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -55,6 +55,7 @@ #include <linux/pgtable.h> #include <linux/buildid.h> #include <linux/task_work.h> +#include <linux/percpu-rwsem.h> #include "internal.h" @@ -452,8 +453,8 @@ static struct kmem_cache *perf_event_cache; */ int sysctl_perf_event_paranoid __read_mostly = 2; -/* Minimum for 512 kiB + 1 user control page */ -int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */ +static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* * max perf event sample rate @@ -463,6 +464,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; @@ -484,7 +486,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, +static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -506,9 +508,7 @@ int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, return 0; } -int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; - -int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, +static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -528,6 +528,52 @@ int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, return 0; } +static const struct ctl_table events_core_sysctl_table[] = { + /* + * User-space relies on this file as a feature check for + * perf_events being enabled. 
It's an ABI, do not remove! + */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_event_max_sample_rate_handler, + .extra1 = SYSCTL_ONE, + }, + { + .procname = "perf_cpu_time_max_percent", + .data = &sysctl_perf_cpu_time_max_percent, + .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), + .mode = 0644, + .proc_handler = perf_cpu_time_max_percent_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, + }, +}; + +static int __init init_events_core_sysctls(void) +{ + register_sysctl_init("kernel", events_core_sysctl_table); + return 0; +} +core_initcall(init_events_core_sysctls); + + /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not @@ -1147,8 +1193,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - timer->function = perf_mux_hrtimer_handler; + hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_HARD); } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) @@ -1172,42 +1218,40 @@ static int perf_mux_hrtimer_restart_ipi(void *arg) return perf_mux_hrtimer_restart(arg); } +static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) +{ + return *this_cpu_ptr(pmu->cpu_pmu_context); +} + void perf_pmu_disable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { - int *count = this_cpu_ptr(pmu->pmu_disable_count); + int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { - WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); + int *count = &this_cpc(pmu)->pmu_disable_count; + WARN_ON_ONCE(*count == 0); } -static void get_ctx(struct perf_event_context *ctx) +static inline void perf_pmu_read(struct perf_event *event) { - refcount_inc(&ctx->refcount); -} - -static void *alloc_task_ctx_data(struct pmu *pmu) -{ - if (pmu->task_ctx_cache) - return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL); - - return NULL; + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); } -static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data) +static void get_ctx(struct perf_event_context *ctx) { - if (pmu->task_ctx_cache && task_ctx_data) - kmem_cache_free(pmu->task_ctx_cache, task_ctx_data); + refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) @@ -2303,7 +2347,7 @@ static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); enum perf_event_state state = 
PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled @@ -2444,9 +2488,8 @@ __perf_remove_from_context(struct perf_event *event, pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -2584,7 +2627,7 @@ static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2691,7 +2734,7 @@ error: static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); /* * Groups consisting entirely of software events can always go on. @@ -3314,9 +3357,8 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !(ctx->is_active & EVENT_ALL)) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); - cpc = this_cpu_ptr(pmu->cpu_pmu_context); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } @@ -3473,8 +3515,7 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + perf_pmu_read(event); perf_event_update_time(event); @@ -3522,52 +3563,17 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ - for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ - pos2 = list_first_entry(head2, typeof(*pos2), member); \ - !list_entry_is_head(pos1, head1, member) && \ - !list_entry_is_head(pos2, head2, member); \ - pos1 = list_next_entry(pos1, member), \ - pos2 = list_next_entry(pos2, member)) - -static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, - struct perf_event_context *next_ctx) -{ - struct perf_event_pmu_context *prev_epc, *next_epc; - - if (!prev_ctx->nr_task_data) - return; - - double_list_for_each_entry(prev_epc, next_epc, - &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, - pmu_ctx_entry) { - - if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) - continue; - - /* - * PMU specific parts of task perf context can require - * additional synchronization. 
As an example of such - * synchronization see implementation details of Intel - * LBR call stack data profiling; - */ - if (prev_epc->pmu->swap_task_ctx) - prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); - else - swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); - } -} - -static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) +static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, + struct task_struct *task, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) - pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); + pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in); } } @@ -3630,17 +3636,16 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - perf_ctx_sched_task_cb(ctx, false); - perf_event_swap_task_ctx_data(ctx, next_ctx); + perf_ctx_sched_task_cb(ctx, task, false); perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of - * ctx->task and ctx->task_ctx_data are immaterial - * since those values are always verified under - * ctx->lock which we're now holding. + * ctx->task is immaterial since this value is + * always verified under ctx->lock which we're now + * holding. */ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); @@ -3660,7 +3665,7 @@ unlock: perf_ctx_disable(ctx, false); inside_switch: - perf_ctx_sched_task_cb(ctx, false); + perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); @@ -3673,7 +3678,7 @@ static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); this_cpu_dec(perf_sched_cb_usages); barrier(); @@ -3685,7 +3690,7 @@ void perf_sched_cb_dec(struct pmu *pmu) void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); + struct perf_cpu_pmu_context *cpc = this_cpc(pmu); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); @@ -3702,7 +3707,8 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ -static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, + struct task_struct *task, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; @@ -3716,7 +3722,7 @@ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_i perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); - pmu->sched_task(cpc->task_epc, sched_in); + pmu->sched_task(cpc->task_epc, task, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -3734,7 +3740,7 @@ static void perf_pmu_sched_task(struct task_struct *prev, return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) - __perf_pmu_sched_task(cpc, sched_in); + __perf_pmu_sched_task(cpc, sched_in ? 
next : prev, sched_in); } static void perf_event_switch(struct task_struct *task, @@ -3802,7 +3808,7 @@ static void __link_epc(struct perf_event_pmu_context *pmu_ctx) if (!pmu_ctx->ctx->task) return; - cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } @@ -3930,11 +3936,15 @@ static int merge_sched_in(struct perf_event *event, void *data) if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + + if (*perf_event_fasync(event)) + event->pending_kill = POLL_HUP; + + perf_event_wakeup(event); } else { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); event->pmu_ctx->rotate_necessary = 1; - cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } @@ -4029,7 +4039,7 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx, false); - perf_ctx_sched_task_cb(ctx, true); + perf_ctx_sched_task_cb(ctx, task, true); perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); @@ -4060,7 +4070,7 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_event_sched_in(cpuctx, ctx, NULL); - perf_ctx_sched_task_cb(cpuctx->task_ctx, true); + perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx, false); @@ -4618,15 +4628,8 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -4883,7 +4886,7 @@ find_get_context(struct task_struct *task, struct perf_event *event) if (!task) { /* Must be root to operate on a CPU event: */ - err = perf_allow_cpu(&event->attr); + err = perf_allow_cpu(); if (err) return ERR_PTR(err); @@ -4950,8 +4953,7 @@ static struct perf_event_pmu_context * find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, struct perf_event *event) { - struct perf_event_pmu_context *new = NULL, *epc; - void *task_ctx_data = NULL; + struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc; if (!ctx->task) { /* @@ -4961,11 +4963,14 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, */ struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { - atomic_set(&epc->refcount, 1); + /* + * One extra reference for the pmu; see perf_pmu_free(). 
+ */ + atomic_set(&epc->refcount, 2); epc->embedded = 1; list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; @@ -4981,14 +4986,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, if (!new) return ERR_PTR(-ENOMEM); - if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = alloc_task_ctx_data(pmu); - if (!task_ctx_data) { - kfree(new); - return ERR_PTR(-ENOMEM); - } - } - __perf_init_event_pmu_context(new, pmu); /* @@ -5007,23 +5004,23 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, atomic_inc(&epc->refcount); goto found_epc; } + /* Make sure the pmu_ctx_list is sorted by PMU type: */ + if (!pos && epc->pmu->type > pmu->type) + pos = epc; } epc = new; new = NULL; - list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + if (!pos) + list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + else + list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev); + epc->ctx = ctx; found_epc: - if (task_ctx_data && !epc->task_ctx_data) { - epc->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - ctx->nr_task_data++; - } raw_spin_unlock_irq(&ctx->lock); - - free_task_ctx_data(pmu, task_ctx_data); kfree(new); return epc; @@ -5034,11 +5031,18 @@ static void get_pmu_ctx(struct perf_event_pmu_context *epc) WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); } +static void free_cpc_rcu(struct rcu_head *head) +{ + struct perf_cpu_pmu_context *cpc = + container_of(head, typeof(*cpc), epc.rcu_head); + + kfree(cpc); +} + static void free_epc_rcu(struct rcu_head *head) { struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); - kfree(epc->task_ctx_data); kfree(epc); } @@ -5068,8 +5072,10 @@ static void put_pmu_ctx(struct perf_event_pmu_context *epc) raw_spin_unlock_irqrestore(&ctx->lock, flags); - if (epc->embedded) + if (epc->embedded) { + call_rcu(&epc->rcu_head, free_cpc_rcu); return; + } call_rcu(&epc->rcu_head, free_epc_rcu); } @@ -5145,6 +5151,225 @@ static void unaccount_freq_event(void) atomic_dec(&nr_freq_events); } + +static struct perf_ctx_data * +alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global) +{ + struct perf_ctx_data *cd; + + cd = kzalloc(sizeof(*cd), GFP_KERNEL); + if (!cd) + return NULL; + + cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL); + if (!cd->data) { + kfree(cd); + return NULL; + } + + cd->global = global; + cd->ctx_cache = ctx_cache; + refcount_set(&cd->refcount, 1); + + return cd; +} + +static void free_perf_ctx_data(struct perf_ctx_data *cd) +{ + kmem_cache_free(cd->ctx_cache, cd->data); + kfree(cd); +} + +static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head) +{ + struct perf_ctx_data *cd; + + cd = container_of(rcu_head, struct perf_ctx_data, rcu_head); + free_perf_ctx_data(cd); +} + +static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd) +{ + call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu); +} + +static int +attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache, + bool global) +{ + struct perf_ctx_data *cd, *old = NULL; + + cd = alloc_perf_ctx_data(ctx_cache, global); + if (!cd) + return -ENOMEM; + + for (;;) { + if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) { + if (old) + perf_free_ctx_data_rcu(old); + return 0; + } + + if (!old) { + /* + * After seeing a dead @old, we raced with + * removal and lost, try again to install @cd. 
+ */ + continue; + } + + if (refcount_inc_not_zero(&old->refcount)) { + free_perf_ctx_data(cd); /* unused */ + return 0; + } + + /* + * @old is a dead object, refcount==0 is stable, try and + * replace it with @cd. + */ + } + return 0; +} + +static void __detach_global_ctx_data(void); +DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem); +static refcount_t global_ctx_data_ref; + +static int +attach_global_ctx_data(struct kmem_cache *ctx_cache) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + int ret; + + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (refcount_inc_not_zero(&global_ctx_data_ref)) + return 0; +again: + /* Allocate everything */ + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (cd && !cd->global) { + cd->global = 1; + if (!refcount_inc_not_zero(&cd->refcount)) + cd = NULL; + } + if (!cd) { + get_task_struct(p); + goto alloc; + } + } + } + + refcount_set(&global_ctx_data_ref, 1); + + return 0; +alloc: + ret = attach_task_ctx_data(p, ctx_cache, true); + put_task_struct(p); + if (ret) { + __detach_global_ctx_data(); + return ret; + } + goto again; +} + +static int +attach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache; + int ret; + + if (!ctx_cache) + return -ENOMEM; + + if (task) + return attach_task_ctx_data(task, ctx_cache, false); + + ret = attach_global_ctx_data(ctx_cache); + if (ret) + return ret; + + event->attach_state |= PERF_ATTACH_GLOBAL_DATA; + return 0; +} + +static void +detach_task_ctx_data(struct task_struct *p) +{ + struct perf_ctx_data *cd; + + scoped_guard (rcu) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !refcount_dec_and_test(&cd->refcount)) + return; + } + + /* + * The old ctx_data may be lost because of the race. + * Nothing is required to do for the case. + * See attach_task_ctx_data(). 
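attach_task_ctx_data() above publishes a freshly allocated perf_ctx_data with try_cmpxchg() and, when another thread already installed one, tries to take a reference on that winner instead, replacing it outright only if its refcount has already dropped to zero. A stand-alone sketch of this install-or-reuse pattern, using hypothetical names rather than the patch's own:

	#include <linux/atomic.h>
	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct demo_blob {
		refcount_t refcount;
		/* payload ... */
	};

	/* Install @new into *@slot, or reuse whatever object won the race. */
	static int demo_install_or_get(struct demo_blob **slot, struct demo_blob *new)
	{
		struct demo_blob *old = NULL;

		for (;;) {
			if (try_cmpxchg(slot, &old, new))
				return 0;		/* @new is now published */

			/* On failure, try_cmpxchg() updated @old with the current pointer. */
			if (old && refcount_inc_not_zero(&old->refcount)) {
				kfree(new);		/* an existing, live object is reused */
				return 0;
			}
			/* *slot held NULL or a dying object: retry the install. */
		}
	}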
+ */ + if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL)) + perf_free_ctx_data_rcu(cd); +} + +static void __detach_global_ctx_data(void) +{ + struct task_struct *g, *p; + struct perf_ctx_data *cd; + +again: + scoped_guard (rcu) { + for_each_process_thread(g, p) { + cd = rcu_dereference(p->perf_ctx_data); + if (!cd || !cd->global) + continue; + cd->global = 0; + get_task_struct(p); + goto detach; + } + } + return; +detach: + detach_task_ctx_data(p); + put_task_struct(p); + goto again; +} + +static void detach_global_ctx_data(void) +{ + if (refcount_dec_not_one(&global_ctx_data_ref)) + return; + + guard(percpu_write)(&global_ctx_data_rwsem); + if (!refcount_dec_and_test(&global_ctx_data_ref)) + return; + + /* remove everything */ + __detach_global_ctx_data(); +} + +static void detach_perf_ctx_data(struct perf_event *event) +{ + struct task_struct *task = event->hw.target; + + event->attach_state &= ~PERF_ATTACH_TASK_DATA; + + if (task) + return detach_task_ctx_data(task); + + if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) { + detach_global_ctx_data(); + event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA; + } +} + static void unaccount_event(struct perf_event *event) { bool dec = false; @@ -5239,6 +5464,8 @@ static int exclusive_event_init(struct perf_event *event) return -EBUSY; } + event->attach_state |= PERF_ATTACH_EXCLUSIVE; + return 0; } @@ -5246,14 +5473,13 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!is_exclusive_pmu(pmu)) - return; - /* see comment in exclusive_event_init() */ if (event->attach_state & PERF_ATTACH_TASK) atomic_dec(&pmu->exclusive_cnt); else atomic_inc(&pmu->exclusive_cnt); + + event->attach_state &= ~PERF_ATTACH_EXCLUSIVE; } static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) @@ -5285,8 +5511,7 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void perf_addr_filters_splice(struct perf_event *event, - struct list_head *head); +static void perf_free_addr_filters(struct perf_event *event); static void perf_pending_task_sync(struct perf_event *event) { @@ -5312,39 +5537,22 @@ static void perf_pending_task_sync(struct perf_event *event) rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE); } -static void _free_event(struct perf_event *event) +/* vs perf_event_alloc() error */ +static void __free_event(struct perf_event *event) { - irq_work_sync(&event->pending_irq); - irq_work_sync(&event->pending_disable_irq); - perf_pending_task_sync(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) + put_callchain_buffers(); - unaccount_event(event); + kfree(event->addr_filter_ranges); - security_perf_event_free(event); - - if (event->rb) { - /* - * Can happen when we close an event with re-directed output. - * - * Since we have a 0 refcount, perf_mmap_close() will skip - * over us; possibly making our ring_buffer_put() the last. 
- */ - mutex_lock(&event->mmap_mutex); - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - } + if (event->attach_state & PERF_ATTACH_EXCLUSIVE) + exclusive_event_destroy(event); if (is_cgroup_event(event)) perf_detach_cgroup(event); - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - perf_addr_filters_splice(event, NULL); - kfree(event->addr_filter_ranges); + if (event->attach_state & PERF_ATTACH_TASK_DATA) + detach_perf_ctx_data(event); if (event->destroy) event->destroy(event); @@ -5356,22 +5564,60 @@ static void _free_event(struct perf_event *event) if (event->hw.target) put_task_struct(event->hw.target); - if (event->pmu_ctx) + if (event->pmu_ctx) { + /* + * put_pmu_ctx() needs an event->ctx reference, because of + * epc->ctx. + */ + WARN_ON_ONCE(!event->ctx); + WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx); put_pmu_ctx(event->pmu_ctx); + } /* - * perf_event_free_task() relies on put_ctx() being 'last', in particular - * all task references must be cleaned up. + * perf_event_free_task() relies on put_ctx() being 'last', in + * particular all task references must be cleaned up. */ if (event->ctx) put_ctx(event->ctx); - exclusive_event_destroy(event); - module_put(event->pmu->module); + if (event->pmu) + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } +DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) + +/* vs perf_event_alloc() success */ +static void _free_event(struct perf_event *event) +{ + irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); + perf_pending_task_sync(event); + + unaccount_event(event); + + security_perf_event_free(event); + + if (event->rb) { + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); + } + + perf_event_free_bpf_prog(event); + perf_free_addr_filters(event); + + __free_event(event); +} + /* * Used to free events which have a known refcount of 1, such as in error paths * where the event isn't exposed yet and inherited events. @@ -5840,6 +6086,10 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) if (is_event_hup(event)) return events; + if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && + event->attr.pinned)) + return events; + /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. 
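The DEFINE_FREE(__free_event, ...) declaration above ties event teardown into the <linux/cleanup.h> scope-based cleanup helpers that later hunks in this diff lean on for perf_event_alloc() and perf_pmu_register() error unwinding. A minimal sketch of the idiom, with hypothetical names rather than the patch's own:

	#include <linux/cleanup.h>
	#include <linux/slab.h>

	DEFINE_FREE(demo_free, void *, if (_T) kfree(_T))

	static int demo_alloc(void **out)
	{
		void *buf __free(demo_free) = kmalloc(64, GFP_KERNEL);

		if (!buf)
			return -ENOMEM;

		if (!out)
			return -EINVAL;		/* @buf is kfree()d automatically here */

		*out = no_free_ptr(buf);	/* success: disarm the scoped cleanup */
		return 0;
	}

return_ptr(p), used by the reworked perf_event_alloc() further down, is shorthand for returning no_free_ptr(p) once construction has succeeded.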
@@ -5962,14 +6212,15 @@ static int _perf_event_period(struct perf_event *event, u64 value) if (!value) return -EINVAL; - if (event->attr.freq && value > sysctl_perf_event_sample_rate) - return -EINVAL; - - if (perf_event_check_period(event, value)) - return -EINVAL; - - if (!event->attr.freq && (value & (1ULL << 63))) - return -EINVAL; + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) + return -EINVAL; + } else { + if (perf_event_check_period(event, value)) + return -EINVAL; + if (value & (1ULL << 63)) + return -EINVAL; + } event_function_call(event, __perf_event_period, &value); @@ -6277,41 +6528,6 @@ unlock: } EXPORT_SYMBOL_GPL(perf_event_update_userpage); -static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) -{ - struct perf_event *event = vmf->vma->vm_file->private_data; - struct perf_buffer *rb; - vm_fault_t ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) - goto unlock; - - vmf->page = perf_mmap_to_page(rb, vmf->pgoff); - if (!vmf->page) - goto unlock; - - get_page(vmf->page); - vmf->page->mapping = vmf->vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { @@ -6551,13 +6767,87 @@ out_put: ring_buffer_put(rb); /* could be last */ } +static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) +{ + /* The first page is the user control page, others are read-only. */ + return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; +} + static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, + .pfn_mkwrite = perf_mmap_pfn_mkwrite, }; +static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) +{ + unsigned long nr_pages = vma_pages(vma); + int err = 0; + unsigned long pagenum; + + /* + * We map this as a VM_PFNMAP VMA. + * + * This is not ideal as this is designed broadly for mappings of PFNs + * referencing memory-mapped I/O ranges or non-system RAM i.e. for which + * !pfn_valid(pfn). + * + * We are mapping kernel-allocated memory (memory we manage ourselves) + * which would more ideally be mapped using vm_insert_page() or a + * similar mechanism, that is as a VM_MIXEDMAP mapping. + * + * However this won't work here, because: + * + * 1. It uses vma->vm_page_prot, but this field has not been completely + * setup at the point of the f_op->mmp() hook, so we are unable to + * indicate that this should be mapped CoW in order that the + * mkwrite() hook can be invoked to make the first page R/W and the + * rest R/O as desired. + * + * 2. Anything other than a VM_PFNMAP of valid PFNs will result in + * vm_normal_page() returning a struct page * pointer, which means + * vm_ops->page_mkwrite() will be invoked rather than + * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping + * to work around retry logic in the fault handler, however this + * field is no longer allowed to be used within struct page. + * + * 3. Having a struct page * made available in the fault logic also + * means that the page gets put on the rmap and becomes + * inappropriately accessible and subject to map and ref counting. 
+ * + * Ideally we would have a mechanism that could explicitly express our + * desires, but this is not currently the case, so we instead use + * VM_PFNMAP. + * + * We manage the lifetime of these mappings with internal refcounts (see + * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of + * this mapping is maintained correctly. + */ + for (pagenum = 0; pagenum < nr_pages; pagenum++) { + unsigned long va = vma->vm_start + PAGE_SIZE * pagenum; + struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum); + + if (page == NULL) { + err = -EINVAL; + break; + } + + /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */ + err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE, + vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); + if (err) + break; + } + +#ifdef CONFIG_MMU + /* Clear any partial mappings on error. */ + if (err) + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); +#endif + + return err; +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; @@ -6569,7 +6859,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long vma_size; unsigned long nr_pages; long user_extra = 0, extra = 0; - int ret = 0, flags = 0; + int ret, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would @@ -6587,9 +6877,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return ret; vma_size = vma->vm_end - vma->vm_start; + nr_pages = vma_size / PAGE_SIZE; + + if (nr_pages > INT_MAX) + return -ENOMEM; + + if (vma_size != PAGE_SIZE * nr_pages) + return -EINVAL; + + user_extra = nr_pages; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; if (vma->vm_pgoff == 0) { - nr_pages = (vma_size / PAGE_SIZE) - 1; + nr_pages -= 1; + + /* + * If we have rb pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + goto unlock; + + WARN_ON_ONCE(event->ctx->parent_ctx); + + if (event->rb) { + if (data_page_nr(event->rb) != nr_pages) + goto unlock; + + if (atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Success -- managed to mmap() the same buffer + * multiple times. + */ + ret = 0; + /* We need the rb to map pages. */ + rb = event->rb; + goto unlock; + } + + /* + * Raced against perf_mmap_close()'s + * atomic_dec_and_mutex_lock() remove the + * event and continue as if !event->rb + */ + ring_buffer_attach(event, NULL); + } + } else { /* * AUX area mapping: if rb->aux_nr_pages != 0, it's already @@ -6598,16 +6933,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) */ u64 aux_offset, aux_size; - if (!event->rb) - return -EINVAL; - - nr_pages = vma_size / PAGE_SIZE; - if (nr_pages > INT_MAX) - return -ENOMEM; - - mutex_lock(&event->mmap_mutex); - ret = -EINVAL; - rb = event->rb; if (!rb) goto aux_unlock; @@ -6648,46 +6973,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } atomic_set(&rb->aux_mmap_count, 1); - user_extra = nr_pages; - - goto accounting; } - /* - * If we have rb pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. 
- */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - WARN_ON_ONCE(event->ctx->parent_ctx); -again: - mutex_lock(&event->mmap_mutex); - if (event->rb) { - if (data_page_nr(event->rb) != nr_pages) { - ret = -EINVAL; - goto unlock; - } - - if (!atomic_inc_not_zero(&event->rb->mmap_count)) { - /* - * Raced against perf_mmap_close(); remove the - * event and try again. - */ - ring_buffer_attach(event, NULL); - mutex_unlock(&event->mmap_mutex); - goto again; - } - - goto unlock; - } - - user_extra = nr_pages + 1; - -accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -6755,6 +7042,8 @@ accounting: rb->aux_mmap_locked = extra; } + ret = 0; + unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); @@ -6776,7 +7065,10 @@ aux_unlock: vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; - if (event->pmu->event_mapped) + if (!ret) + ret = map_range(rb, vma); + + if (!ret && event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); return ret; @@ -7400,9 +7692,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && - (leader->state == PERF_EVENT_STATE_ACTIVE)) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) @@ -7415,9 +7706,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) @@ -7476,6 +7766,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) @@ -8277,7 +8570,8 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctx); perf_event_remove_on_exec(ctx); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); + scoped_guard(rcu) + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); perf_unpin_context(ctx); put_ctx(ctx); @@ -8469,10 +8763,58 @@ static void perf_event_task(struct task_struct *task, task_ctx); } +/* + * Allocate data for a new task when profiling system-wide + * events which require PMU specific data + */ +static void +perf_event_alloc_task_data(struct task_struct *child, + struct task_struct *parent) +{ + struct kmem_cache *ctx_cache = NULL; + struct perf_ctx_data *cd; + + if (!refcount_read(&global_ctx_data_ref)) + return; + + scoped_guard (rcu) { + cd = rcu_dereference(parent->perf_ctx_data); + if (cd) + ctx_cache = cd->ctx_cache; + } + + if (!ctx_cache) + return; + + guard(percpu_read)(&global_ctx_data_rwsem); + scoped_guard (rcu) { + cd = rcu_dereference(child->perf_ctx_data); + if (!cd) { + /* + * A system-wide event may be unaccount, + * when attaching the perf_ctx_data. 
+ */ + if (!refcount_read(&global_ctx_data_ref)) + return; + goto attach; + } + + if (!cd->global) { + cd->global = 1; + refcount_inc(&cd->refcount); + } + } + + return; +attach: + attach_task_ctx_data(child, ctx_cache, true); +} + void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); perf_event_namespaces(task); + perf_event_alloc_task_data(task, current); } /* @@ -8536,7 +8878,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) unsigned int size; memset(comm, 0, sizeof(comm)); - strscpy(comm, comm_event->task->comm, sizeof(comm)); + strscpy(comm, comm_event->task->comm); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -8980,7 +9322,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } cpy_name: - strscpy(tmp, name, sizeof(tmp)); + strscpy(tmp, name); name = tmp; got_name: /* @@ -9404,7 +9746,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; - strscpy(name, sym, KSYM_NAME_LEN); + strscpy(name, sym); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; @@ -10039,8 +10381,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, data, regs); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; @@ -10425,9 +10766,9 @@ static struct pmu perf_tracepoint = { }; static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) + struct perf_raw_record *raw) { - void *record = data->raw->frag.data; + void *record = raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -10439,7 +10780,7 @@ static int perf_tp_filter_match(struct perf_event *event, } static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data, + struct perf_raw_record *raw, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) @@ -10450,7 +10791,7 @@ static int perf_tp_event_match(struct perf_event *event, if (event->attr.exclude_kernel && !user_mode(regs)) return 0; - if (!perf_tp_filter_match(event, data)) + if (!perf_tp_filter_match(event, raw)) return 0; return 1; @@ -10476,6 +10817,7 @@ EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event *event) { struct trace_entry *entry = record; @@ -10485,13 +10827,17 @@ static void __perf_tp_event_target_task(u64 count, void *record, /* Cannot deliver synchronous signal to other task. 
*/ if (event->attr.sigtrap) return; - if (perf_tp_event_match(event, data, regs)) + if (perf_tp_event_match(event, raw, regs)) { + perf_sample_data_init(data, 0, 0); + perf_sample_save_raw_data(data, event, raw); perf_swevent_event(event, count, data, regs); + } } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); @@ -10499,15 +10845,15 @@ static void perf_tp_event_target_task(u64 count, void *record, struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } } @@ -10525,15 +10871,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, }, }; - perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); - perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) { - perf_swevent_event(event, count, &data, regs); - + if (perf_tp_event_match(event, &raw, regs)) { /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and @@ -10543,7 +10884,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, * because data->sample_flags is set. */ perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); + perf_swevent_event(event, count, &data, regs); } } @@ -10560,7 +10902,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; raw_spin_lock(&ctx->lock); - perf_tp_event_target_task(count, record, regs, &data, ctx); + perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); @@ -10787,6 +11129,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, void perf_event_free_bpf_prog(struct perf_event *event) { + if (!event->prog) + return; + if (!perf_event_is_tracing(event)) { perf_event_free_bpf_handler(event); return; @@ -10885,6 +11230,17 @@ static void perf_addr_filters_splice(struct perf_event *event, free_filters_list(&list); } +static void perf_free_addr_filters(struct perf_event *event) +{ + /* + * Used during free paths, there is no concurrency. + */ + if (list_empty(&event->addr_filters.list)) + return; + + perf_addr_filters_splice(event, NULL); +} + /* * Scan through mm's vmas and see if one of them matches the * @filter; if so, adjust filter's address range. 
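The strscpy() hunks above (perf_event_comm_event(), perf_event_mmap_event(), perf_event_ksymbol()) drop the explicit size argument: when the destination is a fixed-size array, the two-argument form derives the bound from sizeof() the array itself, so the size can no longer drift from the buffer it describes. A tiny, hypothetical illustration:

	#include <linux/printk.h>
	#include <linux/sched.h>
	#include <linux/string.h>

	static void demo_copy_comm(struct task_struct *task)
	{
		char comm[TASK_COMM_LEN];

		/* Bound inferred from sizeof(comm); still NUL-terminates and
		 * still returns -E2BIG on truncation, like the 3-arg form. */
		strscpy(comm, task->comm);

		pr_debug("comm=%s\n", comm);
	}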
@@ -11323,8 +11679,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hwc->hrtimer.function = perf_swevent_hrtimer; + hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); /* * Since hrtimers have a fixed rate, we can do a static freq->period @@ -11561,11 +11916,6 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } -static void free_pmu_context(struct pmu *pmu) -{ - free_percpu(pmu->cpu_pmu_context); -} - /* * Let userspace know that this PMU supports address range filtering: */ @@ -11575,7 +11925,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return sysfs_emit(page, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -11586,7 +11936,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); + return sysfs_emit(page, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -11597,7 +11947,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); + return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -11628,7 +11978,7 @@ perf_event_mux_interval_ms_store(struct device *dev, cpus_read_lock(); for_each_online_cpu(cpu) { struct perf_cpu_pmu_context *cpc; - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); @@ -11771,62 +12121,107 @@ del_dev: free_dev: put_device(pmu->dev); + pmu->dev = NULL; goto out; } static struct lock_class_key cpuctx_mutex; static struct lock_class_key cpuctx_lock; -int perf_pmu_register(struct pmu *pmu, const char *name, int type) +static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new) { - int cpu, ret, max = PERF_TYPE_MAX; + void *tmp, *val = idr_find(idr, id); - mutex_lock(&pmus_lock); - ret = -ENOMEM; - pmu->pmu_disable_count = alloc_percpu(int); - if (!pmu->pmu_disable_count) - goto unlock; + if (val != old) + return false; - pmu->type = -1; - if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) { - ret = -EINVAL; - goto free_pdc; + tmp = idr_replace(idr, new, id); + if (IS_ERR(tmp)) + return false; + + WARN_ON_ONCE(tmp != val); + return true; +} + +static void perf_pmu_free(struct pmu *pmu) +{ + if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); } - if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { - ret = -EINVAL; - goto free_pdc; + if (pmu->cpu_pmu_context) { + int cpu; + + for_each_possible_cpu(cpu) { + struct perf_cpu_pmu_context *cpc; + + cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu); + if (!cpc) + continue; + if (cpc->epc.embedded) { + /* refcount managed */ + put_pmu_ctx(&cpc->epc); + continue; + } + kfree(cpc); + } + free_percpu(pmu->cpu_pmu_context); } +} + +DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T)) + +int perf_pmu_register(struct pmu 
*_pmu, const char *name, int type) +{ + int cpu, max = PERF_TYPE_MAX; + + struct pmu *pmu __free(pmu_unregister) = _pmu; + guard(mutex)(&pmus_lock); + + if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) + return -EINVAL; + + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, + "Can not register a pmu with an invalid scope.\n")) + return -EINVAL; pmu->name = name; if (type >= 0) max = type; - ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); - if (ret < 0) - goto free_pdc; + CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL); + if (pmu_type.id < 0) + return pmu_type.id; - WARN_ON(type >= 0 && ret != type); + WARN_ON(type >= 0 && pmu_type.id != type); - type = ret; - pmu->type = type; + pmu->type = pmu_type.id; + atomic_set(&pmu->exclusive_cnt, 0); if (pmu_bus_running && !pmu->dev) { - ret = pmu_dev_alloc(pmu); + int ret = pmu_dev_alloc(pmu); if (ret) - goto free_idr; + return ret; } - ret = -ENOMEM; - pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *); if (!pmu->cpu_pmu_context) - goto free_dev; + return -ENOMEM; for_each_possible_cpu(cpu) { - struct perf_cpu_pmu_context *cpc; + struct perf_cpu_pmu_context *cpc = + kmalloc_node(sizeof(struct perf_cpu_pmu_context), + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); - cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + if (!cpc) + return -ENOMEM; + + *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc; __perf_init_event_pmu_context(&cpc->epc, pmu); __perf_mux_hrtimer_init(cpc, cpu); } @@ -11859,33 +12254,25 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; + /* + * Now that the PMU is complete, make it visible to perf_try_init_event(). 
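perf_pmu_register() now allocates a per-CPU array of pointers and fills each slot with a perf_cpu_pmu_context kmalloc'd on that CPU's home node; this_cpc(), introduced earlier in the diff, simply dereferences the local slot. Keeping pointers rather than embedding the structs lets an individual context be freed on its own via RCU (see free_cpc_rcu() above). A stand-alone sketch of that layout, with hypothetical names:

	#include <linux/cpumask.h>
	#include <linux/percpu.h>
	#include <linux/slab.h>
	#include <linux/topology.h>

	struct demo_ctx {
		int val;
	};

	static struct demo_ctx * __percpu *demo_ctxp;

	static int demo_alloc_contexts(void)
	{
		int cpu;

		demo_ctxp = alloc_percpu(struct demo_ctx *);
		if (!demo_ctxp)
			return -ENOMEM;

		for_each_possible_cpu(cpu) {
			struct demo_ctx *ctx = kzalloc_node(sizeof(*ctx), GFP_KERNEL,
							    cpu_to_node(cpu));

			if (!ctx)
				return -ENOMEM;	/* a real caller would unwind the CPUs done so far */

			*per_cpu_ptr(demo_ctxp, cpu) = ctx;
		}
		return 0;
	}

	static struct demo_ctx *this_demo_ctx(void)
	{
		return *this_cpu_ptr(demo_ctxp);	/* this CPU's slot */
	}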
+ */ + if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu)) + return -EINVAL; list_add_rcu(&pmu->entry, &pmus); - atomic_set(&pmu->exclusive_cnt, 0); - ret = 0; -unlock: - mutex_unlock(&pmus_lock); - return ret; - -free_dev: - if (pmu->dev && pmu->dev != PMU_NULL_DEV) { - device_del(pmu->dev); - put_device(pmu->dev); - } - -free_idr: - idr_remove(&pmu_idr, pmu->type); - -free_pdc: - free_percpu(pmu->pmu_disable_count); - goto unlock; + take_idr_id(pmu_type); + _pmu = no_free_ptr(pmu); // let it rip + return 0; } EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { - mutex_lock(&pmus_lock); - list_del_rcu(&pmu->entry); + scoped_guard (mutex, &pmus_lock) { + list_del_rcu(&pmu->entry); + idr_remove(&pmu_idr, pmu->type); + } /* * We dereference the pmu list under both SRCU and regular RCU, so @@ -11894,16 +12281,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_srcu(&pmus_srcu); synchronize_rcu(); - free_percpu(pmu->pmu_disable_count); - idr_remove(&pmu_idr, pmu->type); - if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) { - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); - } - free_pmu_context(pmu); - mutex_unlock(&pmus_lock); + perf_pmu_free(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); @@ -11943,48 +12321,61 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); - if (!ret) { - if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && - has_extended_regs(event)) - ret = -EOPNOTSUPP; + if (ret) + goto err_pmu; - if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) - ret = -EINVAL; + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) { + ret = -EOPNOTSUPP; + goto err_destroy; + } - if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { - const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); - struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); - int cpu; - - if (pmu_cpumask && cpumask) { - cpu = cpumask_any_and(pmu_cpumask, cpumask); - if (cpu >= nr_cpu_ids) - ret = -ENODEV; - else - event->event_caps |= PERF_EV_CAP_READ_SCOPE; - } else { - ret = -ENODEV; - } - } + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + ret = -EINVAL; + goto err_destroy; + } + + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask; + struct cpumask *pmu_cpumask; + int cpu; + + cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + pmu_cpumask = perf_scope_cpumask(pmu->scope); + + ret = -ENODEV; + if (!pmu_cpumask || !cpumask) + goto err_destroy; - if (ret && event->destroy) - event->destroy(event); + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + goto err_destroy; + + event->event_caps |= PERF_EV_CAP_READ_SCOPE; } - if (ret) - module_put(pmu->module); + return 0; + +err_destroy: + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } +err_pmu: + event->pmu = NULL; + module_put(pmu->module); return ret; } static struct pmu *perf_init_event(struct perf_event *event) { bool extended_type = false; - int idx, type, ret; struct pmu *pmu; + int type, ret; - idx = srcu_read_lock(&pmus_srcu); + guard(srcu)(&pmus_srcu); /* * Save original type before calling pmu->event_init() since certain @@ -11997,7 +12388,7 @@ static struct pmu *perf_init_event(struct 
perf_event *event) pmu = event->parent->pmu; ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; } /* @@ -12016,13 +12407,12 @@ static struct pmu *perf_init_event(struct perf_event *event) } again: - rcu_read_lock(); - pmu = idr_find(&pmu_idr, type); - rcu_read_unlock(); + scoped_guard (rcu) + pmu = idr_find(&pmu_idr, type); if (pmu) { if (event->attr.type != type && type != PERF_TYPE_RAW && !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) - goto fail; + return ERR_PTR(-ENOENT); ret = perf_try_init_event(pmu, event); if (ret == -ENOENT && event->attr.type != type && !extended_type) { @@ -12031,27 +12421,21 @@ again: } if (ret) - pmu = ERR_PTR(ret); + return ERR_PTR(ret); - goto unlock; + return pmu; } list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { ret = perf_try_init_event(pmu, event); if (!ret) - goto unlock; + return pmu; - if (ret != -ENOENT) { - pmu = ERR_PTR(ret); - goto unlock; - } + if (ret != -ENOENT) + return ERR_PTR(ret); } -fail: - pmu = ERR_PTR(-ENOENT); -unlock: - srcu_read_unlock(&pmus_srcu, idx); - return pmu; + return ERR_PTR(-ENOENT); } static void attach_sb_event(struct perf_event *event) @@ -12178,7 +12562,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, void *context, int cgroup_fd) { struct pmu *pmu; - struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; int node; @@ -12193,8 +12576,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; - event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, - node); + struct perf_event *event __free(__free_event) = + kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node); if (!event) return ERR_PTR(-ENOMEM); @@ -12301,15 +12684,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * See perf_output_read(). */ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) - goto err_ns; + return ERR_PTR(-EINVAL); if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; pmu = perf_init_event(event); - if (IS_ERR(pmu)) { - err = PTR_ERR(pmu); - goto err_ns; + if (IS_ERR(pmu)) + return (void*)pmu; + + /* + * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config(). + * The attach should be right after the perf_init_event(). + * Otherwise, the __free_event() would mistakenly detach the non-exist + * perf_ctx_data because of the other errors between them. + */ + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + err = attach_perf_ctx_data(event); + if (err) + return ERR_PTR(err); } /* @@ -12317,49 +12710,39 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * events (they don't make sense as the cgroup will be different * on other CPUs in the uncore mask). 
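perf_init_event() and perf_pmu_register() above switch from explicit srcu_read_lock()/mutex_lock() pairs to guard(srcu), guard(mutex) and scoped_guard(rcu), which come from the same <linux/cleanup.h> machinery and drop the lock automatically when the scope ends. A small sketch with hypothetical names:

	#include <linux/cleanup.h>
	#include <linux/idr.h>
	#include <linux/mutex.h>
	#include <linux/rcupdate.h>

	static DEFINE_MUTEX(demo_lock);
	static DEFINE_IDR(demo_idr);

	static void *demo_lookup(int id)
	{
		void *obj;

		guard(mutex)(&demo_lock);	/* released on any return path */

		scoped_guard(rcu) {
			/* RCU read section covers only this block; a real user
			 * would take a reference before leaving it. */
			obj = idr_find(&demo_idr, id);
		}

		return obj;
	}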
*/ - if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { - err = -EINVAL; - goto err_pmu; - } + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) + return ERR_PTR(-EINVAL); if (event->attr.aux_output && (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || - event->attr.aux_pause || event->attr.aux_resume)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + event->attr.aux_pause || event->attr.aux_resume)) + return ERR_PTR(-EOPNOTSUPP); - if (event->attr.aux_pause && event->attr.aux_resume) { - err = -EINVAL; - goto err_pmu; - } + if (event->attr.aux_pause && event->attr.aux_resume) + return ERR_PTR(-EINVAL); if (event->attr.aux_start_paused) { - if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { - err = -EOPNOTSUPP; - goto err_pmu; - } + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return ERR_PTR(-EOPNOTSUPP); event->hw.aux_paused = 1; } if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) - goto err_pmu; + return ERR_PTR(err); } err = exclusive_event_init(event); if (err) - goto err_pmu; + return ERR_PTR(err); if (has_addr_filter(event)) { event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, sizeof(struct perf_addr_filter_range), GFP_KERNEL); - if (!event->addr_filter_ranges) { - err = -ENOMEM; - goto err_per_task; - } + if (!event->addr_filter_ranges) + return ERR_PTR(-ENOMEM); /* * Clone the parent's vma offsets: they are valid until exec() @@ -12383,42 +12766,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(attr->sample_max_stack); if (err) - goto err_addr_filters; + return ERR_PTR(err); + event->attach_state |= PERF_ATTACH_CALLCHAIN; } } err = security_perf_event_alloc(event); if (err) - goto err_callchain_buffer; + return ERR_PTR(err); /* symmetric to unaccount_event() in _free_event() */ account_event(event); - return event; - -err_callchain_buffer: - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } -err_addr_filters: - kfree(event->addr_filter_ranges); - -err_per_task: - exclusive_event_destroy(event); - -err_pmu: - if (is_cgroup_event(event)) - perf_detach_cgroup(event); - if (event->destroy) - event->destroy(event); - module_put(pmu->module); -err_ns: - if (event->hw.target) - put_task_struct(event->hw.target); - call_rcu(&event->rcu_head, free_event_rcu); - - return ERR_PTR(err); + return_ptr(event); } static int perf_copy_attr(struct perf_event_attr __user *uattr, @@ -12488,7 +12848,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, } /* privileged levels capture (kernel, hv): check permissions */ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) { - ret = perf_allow_kernel(attr); + ret = perf_allow_kernel(); if (ret) return ret; } @@ -12745,12 +13105,12 @@ SYSCALL_DEFINE5(perf_event_open, return err; /* Do we allow access to perf_event_open(2) ? 
*/ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = security_perf_event_open(PERF_SECURITY_OPEN); if (err) return err; if (!attr.exclude_kernel) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -12770,7 +13130,7 @@ SYSCALL_DEFINE5(perf_event_open, /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) { - err = perf_allow_kernel(&attr); + err = perf_allow_kernel(); if (err) return err; } @@ -13492,6 +13852,12 @@ void perf_event_exit_task(struct task_struct *child) * At this point we need to send EXIT events to cpu contexts. */ perf_event_task(child, NULL, 0); + + /* + * Detach the perf_ctx_data for the system-wide event. + */ + guard(percpu_read)(&global_ctx_data_rwsem); + detach_task_ctx_data(child); } static void perf_free_event(struct perf_event *event, @@ -13603,12 +13969,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } -int perf_allow_kernel(struct perf_event_attr *attr) +int perf_allow_kernel(void) { if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) return -EACCES; - return security_perf_event_open(attr, PERF_SECURITY_KERNEL); + return security_perf_event_open(PERF_SECURITY_KERNEL); } EXPORT_SYMBOL_GPL(perf_allow_kernel); @@ -13667,7 +14033,6 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); - /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } @@ -13925,6 +14290,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); + child->perf_ctx_data = NULL; ret = perf_event_init_context(child, clone_flags); if (ret) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bc4a61029b6d..8ec2cb688903 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return -ENOENT; /* - * no branch sampling for breakpoint events + * Check if breakpoint type is supported before proceeding. + * Also, no branch sampling for breakpoint events. 
*/ - if (has_branch_stack(bp)) + if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4f46f688d0d4..5130b119d0ae 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -19,7 +19,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, EPOLLIN); + atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM); handle->event->pending_wakeup = 1; @@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; + handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { @@ -643,7 +644,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx) struct page *page = virt_to_page(rb->aux_pages[idx]); ClearPagePrivate(page); - page->mapping = NULL; __free_page(page); } @@ -819,7 +819,6 @@ static void perf_mmap_free_page(void *addr) { struct page *page = virt_to_page(addr); - page->mapping = NULL; __free_page(page); } @@ -890,28 +889,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); } -static void perf_mmap_unmark_page(void *addr) -{ - struct page *page = vmalloc_to_page(addr); - - page->mapping = NULL; -} - static void rb_free_work(struct work_struct *work) { struct perf_buffer *rb; - void *base; - int i, nr; rb = container_of(work, struct perf_buffer, work); - nr = data_page_nr(rb); - - base = rb->user_page; - /* The '<=' counts in the user page. */ - for (i = 0; i <= nr; i++) - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); - vfree(base); + vfree(rb->user_page); kfree(rb); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5d71ef85420c..70c84b9d7be3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -28,6 +28,7 @@ #include <linux/rcupdate_trace.h> #include <linux/workqueue.h> #include <linux/srcu.h> +#include <linux/oom.h> /* check_stable_address_space */ #include <linux/uprobes.h> @@ -416,7 +417,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " - "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n", + "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); @@ -494,6 +495,11 @@ retry: if (ret <= 0) goto put_old; + if (is_zero_page(old_page)) { + ret = -EINVAL; + goto put_old; + } + if (WARN(!is_register && PageCompound(old_page), "uprobe unregister should never work on compound page\n")) { ret = -EINVAL; @@ -761,10 +767,14 @@ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) enum hprobe_state hstate; /* - * return_instance's hprobe is protected by RCU. - * Underlying uprobe is itself protected from reuse by SRCU. + * Caller should guarantee that return_instance is not going to be + * freed from under us. This can be achieved either through holding + * rcu_read_lock() or by owning return_instance in the first place. + * + * Underlying uprobe is itself protected from reuse by SRCU, so ensure + * SRCU lock is held properly. 
*/ - lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); + lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { @@ -1260,6 +1270,9 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) * returns NULL in find_active_uprobe_rcu(). */ mmap_write_lock(mm); + if (check_stable_address_space(mm)) + goto unlock; + vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) @@ -1888,9 +1901,33 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } -static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) +static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) +{ + ri->cons_cnt = 0; + ri->next = utask->ri_pool; + utask->ri_pool = ri; +} + +static struct return_instance *ri_pool_pop(struct uprobe_task *utask) +{ + struct return_instance *ri = utask->ri_pool; + + if (likely(ri)) + utask->ri_pool = ri->next; + + return ri; +} + +static void ri_free(struct return_instance *ri) { - struct return_instance *next = ri->next; + kfree(ri->extra_consumers); + kfree_rcu(ri, rcu); +} + +static void free_ret_instance(struct uprobe_task *utask, + struct return_instance *ri, bool cleanup_hprobe) +{ + unsigned seq; if (cleanup_hprobe) { enum hprobe_state hstate; @@ -1899,8 +1936,22 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo hprobe_finalize(&ri->hprobe, hstate); } - kfree_rcu(ri, rcu); - return next; + /* + * At this point return_instance is unlinked from utask's + * return_instances list and this has become visible to ri_timer(). + * If seqcount now indicates that ri_timer's return instance + * processing loop isn't active, we can return ri into the pool of + * to-be-reused return instances for future uretprobes. If ri_timer() + * happens to be running right now, though, we fallback to safety and + * just perform RCU-delated freeing of ri. + */ + if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { + /* immediate reuse of ri without RCU GP is OK */ + ri_pool_push(utask, ri); + } else { + /* we might be racing with ri_timer(), so play it safe */ + ri_free(ri); + } } /* @@ -1910,7 +1961,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri; + struct return_instance *ri, *ri_next; if (!utask) return; @@ -1921,8 +1972,19 @@ void uprobe_free_utask(struct task_struct *t) timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; - while (ri) - ri = free_ret_instance(ri, true /* cleanup_hprobe */); + while (ri) { + ri_next = ri->next; + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; + } + + /* free_ret_instance() above might add to ri_pool, so this loop should come last */ + ri = utask->ri_pool; + while (ri) { + ri_next = ri->next; + ri_free(ri); + ri = ri_next; + } kfree(utask); } @@ -1942,8 +2004,12 @@ static void ri_timer(struct timer_list *timer) /* RCU protects return_instance from freeing. 
*/ guard(rcu)(); + write_seqcount_begin(&utask->ri_seqcount); + for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); + + write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) @@ -1955,6 +2021,7 @@ static struct uprobe_task *alloc_utask(void) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); + seqcount_init(&utask->ri_seqcount); return utask; } @@ -1974,32 +2041,40 @@ static struct uprobe_task *get_utask(void) return current->utask; } -static size_t ri_size(int consumers_cnt) +static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; - return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; -} - -#define DEF_CNT 4 - -static struct return_instance *alloc_return_instance(void) -{ - struct return_instance *ri; + ri = ri_pool_pop(utask); + if (ri) + return ri; - ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL); + ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; - ri->consumers_cnt = DEF_CNT; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { - size_t size = ri_size(old->consumers_cnt); + struct return_instance *ri; + + ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); + if (!ri) + return NULL; + + if (unlikely(old->cons_cnt > 1)) { + ri->extra_consumers = kmemdup(old->extra_consumers, + sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), + GFP_KERNEL); + if (!ri->extra_consumers) { + kfree(ri); + return NULL; + } + } - return kmemdup(old, size, GFP_KERNEL); + return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) @@ -2094,8 +2169,8 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) */ unsigned long uprobe_get_trampoline_vaddr(void) { + unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; - unsigned long trampoline_vaddr = -1; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ @@ -2108,14 +2183,17 @@ unsigned long uprobe_get_trampoline_vaddr(void) static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { - struct return_instance *ri = utask->return_instances; + struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { - ri = free_ret_instance(ri, true /* cleanup_hprobe */); + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; + + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; } - rcu_assign_pointer(utask->return_instances, ri); } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, @@ -2180,7 +2258,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, return; free: - kfree(ri); + ri_free(ri); } /* Prepare to single-step probed instruction out of line. 
*/ @@ -2233,9 +2311,8 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { - spin_lock_irq(&t->sighand->siglock); + utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); - spin_unlock_irq(&t->sighand->siglock); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; @@ -2294,6 +2371,47 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) return is_trap_insn(&opcode); } +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) +{ + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; + struct vm_area_struct *vma; + struct file *vm_file; + loff_t offset; + unsigned int seq; + + guard(rcu)(); + + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return NULL; + + vma = vma_lookup(mm, bp_vaddr); + if (!vma) + return NULL; + + /* + * vm_file memory can be reused for another instance of struct file, + * but can't be freed from under us, so it's safe to read fields from + * it, even if the values are some garbage values; ultimately + * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure + * that whatever we speculatively found is correct + */ + vm_file = READ_ONCE(vma->vm_file); + if (!vm_file) + return NULL; + + offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); + uprobe = find_uprobe_rcu(vm_file->f_inode, offset); + if (!uprobe) + return NULL; + + /* now double check that nothing about MM changed */ + if (mmap_lock_speculate_retry(mm, seq)) + return NULL; + + return uprobe; +} + /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { @@ -2301,10 +2419,14 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb struct uprobe *uprobe = NULL; struct vm_area_struct *vma; + uprobe = find_active_uprobe_speculative(bp_vaddr); + if (uprobe) + return uprobe; + mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { - if (valid_vma(vma, false)) { + if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); @@ -2324,25 +2446,27 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb return uprobe; } -static struct return_instance* -push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie) +static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { + struct return_consumer *ric; + if (unlikely(ri == ZERO_SIZE_PTR)) return ri; - if (unlikely(idx >= ri->consumers_cnt)) { - struct return_instance *old_ri = ri; - - ri->consumers_cnt += DEF_CNT; - ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL); - if (!ri) { - kfree(old_ri); + if (unlikely(ri->cons_cnt > 0)) { + ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); + if (!ric) { + ri_free(ri); return ZERO_SIZE_PTR; } + ri->extra_consumers = ric; } - ri->consumers[idx].id = id; - ri->consumers[idx].cookie = cookie; + ric = likely(ri->cons_cnt == 0) ? 
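
find_active_uprobe_speculative() above is an optimistic, lock-free lookup that is validated against the mmap_lock sequence and discarded on any concurrent change, with find_active_uprobe_rcu() falling back to mmap_read_lock() when speculation fails. A hedged userspace analogue of that "speculate, validate, else take the lock" shape (writers are assumed to bump the sequence around updates while holding the lock; names are illustrative):

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_uint map_seq;             /* odd while an update is in progress */
static _Atomic(void *) mapping;         /* state updated only under map_lock */

/* Optimistic, lock-free lookup; NULL means "retry under the lock". */
static void *lookup_speculative(void)
{
        unsigned int seq = atomic_load(&map_seq);
        void *res;

        if (seq & 1)
                return NULL;            /* an update is in progress */
        res = atomic_load(&mapping);    /* read the state without the lock */
        if (atomic_load(&map_seq) != seq)
                return NULL;            /* state changed under us, discard */
        return res;
}

static void *lookup(void)
{
        void *res = lookup_speculative();

        if (res)
                return res;
        pthread_mutex_lock(&map_lock);  /* slow path, like mmap_read_lock() */
        res = atomic_load(&mapping);
        pthread_mutex_unlock(&map_lock);
        return res;
}
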
&ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; + ric->id = id; + ric->cookie = cookie; + + ri->cons_cnt++; return ri; } @@ -2350,14 +2474,17 @@ static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; - int idx = *iter; + int idx; - for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { + for (idx = *iter; idx < ri->cons_cnt; idx++) + { + ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } + return NULL; } @@ -2371,9 +2498,9 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; - int push_idx = 0; + struct uprobe_task *utask = current->utask; - current->utask->auprobe = &uprobe->arch; + utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; @@ -2393,21 +2520,15 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) continue; if (!ri) - ri = alloc_return_instance(); + ri = alloc_return_instance(utask); if (session) - ri = push_consumer(ri, push_idx++, uc->id, cookie); + ri = push_consumer(ri, uc->id, cookie); } - current->utask->auprobe = NULL; + utask->auprobe = NULL; - if (!ZERO_OR_NULL_PTR(ri)) { - /* - * The push_idx value has the final number of return consumers, - * and ri->consumers_cnt has number of allocated consumers. - */ - ri->consumers_cnt = push_idx; + if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); - } if (remove && has_consumers) { down_read(&uprobe->register_rwsem); @@ -2461,7 +2582,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri) void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri, *next; + struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; @@ -2481,8 +2602,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). */ - next = find_next_ret_chain(ri); - valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); + next_chain = find_next_ret_chain(ri); + valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { @@ -2494,7 +2615,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * trampoline addresses on the stack are replaced with correct * original return addresses */ - rcu_assign_pointer(utask->return_instances, ri->next); + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); + utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) @@ -2502,9 +2625,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs) hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. 
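
The push_consumer()/return_consumer_find() rework above replaces the old fixed-size consumers[] array with one consumer stored inline in the return_instance and a lazily grown extra_consumers[] array for the rare multi-consumer case. A small userspace sketch of the same small-size optimization, assuming illustrative type names:

#include <stdlib.h>

struct consumer { unsigned long long id, cookie; };

struct instance {
        struct consumer first;          /* inline slot for the common case */
        struct consumer *extra;         /* lazily grown for consumers 2..N */
        int cnt;
};

static int push_consumer(struct instance *ri, unsigned long long id,
                         unsigned long long cookie)
{
        struct consumer *c;

        if (ri->cnt == 0) {
                c = &ri->first;
        } else {
                c = realloc(ri->extra, sizeof(*c) * ri->cnt);
                if (!c)
                        return -1;
                ri->extra = c;
                c = &ri->extra[ri->cnt - 1];
        }
        c->id = id;
        c->cookie = cookie;
        ri->cnt++;
        return 0;
}

static struct consumer *find_consumer(struct instance *ri, unsigned long long id)
{
        for (int i = 0; i < ri->cnt; i++) {
                struct consumer *c = (i == 0) ? &ri->first : &ri->extra[i - 1];

                if (c->id == id)
                        return c;
        }
        return NULL;
}
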
*/ - ri = free_ret_instance(ri, false /* !cleanup_hprobe */); - utask->depth--; - } while (ri != next); + free_ret_instance(utask, ri, false /* !cleanup_hprobe */); + ri = ri_next; + } while (ri != next_chain); } while (!valid); return; @@ -2622,9 +2745,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* see uprobe_deny_signal() */ - spin_unlock_irq(¤t->sighand->siglock); + if (utask->signal_denied) { + set_thread_flag(TIF_SIGPENDING); + utask->signal_denied = false; + } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); diff --git a/kernel/exit.c b/kernel/exit.c index 1dcddfe537ee..c2e6c7b7779f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,6 +69,7 @@ #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> +#include <linux/pidfs.h> #include <uapi/linux/wait.h> @@ -85,7 +86,7 @@ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_exit_table[] = { +static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, @@ -122,14 +123,22 @@ static __init int kernel_exit_sysfs_init(void) late_initcall(kernel_exit_sysfs_init); #endif -static void __unhash_process(struct task_struct *p, bool group_dead) +/* + * For things release_task() would like to do *after* tasklist_lock is released. + */ +struct release_task_post { + struct pid *pids[PIDTYPE_MAX]; +}; + +static void __unhash_process(struct release_task_post *post, struct task_struct *p, + bool group_dead) { nr_threads--; - detach_pid(p, PIDTYPE_PID); + detach_pid(post->pids, p, PIDTYPE_PID); if (group_dead) { - detach_pid(p, PIDTYPE_TGID); - detach_pid(p, PIDTYPE_PGID); - detach_pid(p, PIDTYPE_SID); + detach_pid(post->pids, p, PIDTYPE_TGID); + detach_pid(post->pids, p, PIDTYPE_PGID); + detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); @@ -141,7 +150,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) /* * This function expects the tasklist_lock write-locked. */ -static void __exit_signal(struct task_struct *tsk) +static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); @@ -174,9 +183,6 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); - /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, @@ -197,23 +203,15 @@ static void __exit_signal(struct task_struct *tsk) task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; - __unhash_process(tsk, group_dead); + __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); - /* - * Do this under ->siglock, we can race with another thread - * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 
- */ - flush_sigqueue(&tsk->pending); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk, TIF_SIGPENDING); - if (group_dead) { - flush_sigqueue(&sig->shared_pending); + if (group_dead) tty_kref_put(tty); - } } static void delayed_put_task_struct(struct rcu_head *rhp) @@ -239,22 +237,27 @@ void __weak release_thread(struct task_struct *dead_task) void release_task(struct task_struct *p) { + struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: + memset(&post, 0, sizeof(post)); + /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); + pidfs_exit(p); cgroup_release(p); + thread_pid = get_pid(p->thread_pid); + write_lock_irq(&tasklist_lock); ptrace_release_task(p); - thread_pid = get_pid(p->thread_pid); - __exit_signal(p); + __exit_signal(&post, p); /* * If we are the last non-leader member of the thread @@ -278,7 +281,20 @@ repeat: write_unlock_irq(&tasklist_lock); proc_flush_pid(thread_pid); put_pid(thread_pid); + add_device_randomness(&p->se.sum_exec_runtime, + sizeof(p->se.sum_exec_runtime)); + free_pids(post.pids); release_thread(p); + /* + * This task was already removed from the process/thread/pid lists + * and lock_task_sighand(p) can't succeed. Nobody else can touch + * ->pending or, if group dead, signal->shared_pending. We can call + * flush_sigqueue() lockless. + */ + flush_sigqueue(&p->pending); + if (thread_group_leader(p)) + flush_sigqueue(&p->signal->shared_pending); + put_task_struct_rcu_user(p); p = leader; @@ -741,10 +757,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_state = EXIT_ZOMBIE; /* - * sub-thread or delay_group_leader(), wake up the - * PIDFD_THREAD waiters. + * Ignore thread-group leaders that exited before all + * subthreads did. */ - if (!thread_group_empty(tsk)) + if (!delay_group_leader(tsk)) do_notify_pidfd(tsk); if (unlikely(tsk->ptrace)) { diff --git a/kernel/fork.c b/kernel/fork.c index 9b301180fd41..a61a4407ebdf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -448,7 +448,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma) return false; init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return true; } @@ -625,8 +625,8 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) * We depend on the oldmm having properly denied write access to the * exe_file already. */ - if (exe_file && deny_write_access(exe_file)) - pr_warn_once("deny_write_access() failed in %s\n", __func__); + if (exe_file && exe_file_deny_write_access(exe_file)) + pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU @@ -760,7 +760,8 @@ loop_out: mt_set_in_rcu(vmi.mas.tree); ksm_fork(mm, oldmm); khugepaged_fork(mm, oldmm); - } else if (mpnt) { + } else { + /* * The entire maple tree has already been duplicated. If the * mmap duplication fails, mark the failure point with @@ -768,8 +769,18 @@ loop_out: * stop releasing VMAs that have not been duplicated after this * point. 
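
The release_task() change above records the detached struct pid pointers in release_task_post while tasklist_lock is write-held and only calls free_pids() (and the now-lockless flush_sigqueue()) after the lock is dropped, keeping the critical section short. A hedged userspace sketch of that "detach under the lock, release after unlock" shape (pthread analogue, illustrative names):

#include <pthread.h>
#include <stdlib.h>

#define NR_SLOTS 4

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static void *shared[NR_SLOTS];          /* entries owned by the shared structure */

static void detach_and_release(void)
{
        void *post[NR_SLOTS];           /* analogue of struct release_task_post */

        pthread_mutex_lock(&list_lock);
        for (int i = 0; i < NR_SLOTS; i++) {    /* detach under the lock ... */
                post[i] = shared[i];
                shared[i] = NULL;
        }
        pthread_mutex_unlock(&list_lock);

        for (int i = 0; i < NR_SLOTS; i++)      /* ... free after dropping it */
                free(post[i]);
}
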
*/ - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); - mas_store(&vmi.mas, XA_ZERO_ENTRY); + if (mpnt) { + mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); + mas_store(&vmi.mas, XA_ZERO_ENTRY); + /* Avoid OOM iterating a broken tree */ + set_bit(MMF_OOM_SKIP, &mm->flags); + } + /* + * The mm_struct is going to exit, but the locks will be dropped + * first. Set the mm_struct as unstable is advisable as it is + * not fully initialised. + */ + set_bit(MMF_UNSTABLE, &mm->flags); } out: mmap_write_unlock(mm); @@ -1262,9 +1273,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); -#ifdef CONFIG_PER_VMA_LOCK - mm->mm_lock_seq = 0; -#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; @@ -1419,13 +1427,13 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) * We expect the caller (i.e., sys_execve) to already denied * write access, so this is unlikely to fail. */ - if (unlikely(deny_write_access(new_exe_file))) + if (unlikely(exe_file_deny_write_access(new_exe_file))) return -EACCES; get_file(new_exe_file); } rcu_assign_pointer(mm->exe_file, new_exe_file); if (old_exe_file) { - allow_write_access(old_exe_file); + exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; @@ -1466,7 +1474,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) return ret; } - ret = deny_write_access(new_exe_file); + ret = exe_file_deny_write_access(new_exe_file); if (ret) return -EACCES; get_file(new_exe_file); @@ -1478,7 +1486,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) mmap_write_unlock(mm); if (old_exe_file) { - allow_write_access(old_exe_file); + exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; @@ -1514,12 +1522,13 @@ struct file *get_task_exe_file(struct task_struct *task) struct file *exe_file = NULL; struct mm_struct *mm; + if (task->flags & PF_KTHREAD) + return NULL; + task_lock(task); mm = task->mm; - if (mm) { - if (!(task->flags & PF_KTHREAD)) - exe_file = get_mm_exe_file(mm); - } + if (mm) + exe_file = get_mm_exe_file(mm); task_unlock(task); return exe_file; } @@ -1882,8 +1891,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); INIT_HLIST_HEAD(&sig->ignored_posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->real_timer.function = it_real_fn; + hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); @@ -2023,25 +2031,18 @@ static inline void rcu_copy_process(struct task_struct *p) */ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - int pidfd; struct file *pidfd_file; - pidfd = get_unused_fd_flags(O_CLOEXEC); + CLASS(get_unused_fd, pidfd)(O_CLOEXEC); if (pidfd < 0) return pidfd; pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); - if (IS_ERR(pidfd_file)) { - put_unused_fd(pidfd); + if (IS_ERR(pidfd_file)) return PTR_ERR(pidfd_file); - } - /* - * anon_inode_getfile() ignores everything outside of the - * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. 
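
The __pidfd_prepare() conversion above lets CLASS(get_unused_fd) own the descriptor, so every early return drops it automatically, and take_fd() transfers ownership out on the success path. A hedged userspace analogue of that pattern using the cleanup attribute (a GCC/Clang extension; these helpers are illustrative, not the kernel macros):

#include <unistd.h>
#include <fcntl.h>

static void put_fd(int *fd)
{
        if (*fd >= 0)
                close(*fd);             /* runs unless ownership was taken */
}

static int take_fd(int *fd)             /* transfer ownership to the caller */
{
        int ret = *fd;

        *fd = -1;                       /* disarm the cleanup handler */
        return ret;
}

static int open_configured(const char *path)
{
        int fd __attribute__((cleanup(put_fd))) = open(path, O_RDONLY);

        if (fd < 0)
                return -1;              /* nothing to clean up */
        /* any failed check here can simply return; put_fd() closes fd */
        return take_fd(&fd);            /* success: caller now owns the fd */
}
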
- */ - pidfd_file->f_flags |= (flags & PIDFD_THREAD); + *ret = pidfd_file; - return pidfd; + return take_fd(pidfd); } /** @@ -2423,8 +2424,11 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; - /* Note that no task has been attached to @pid yet. */ - retval = __pidfd_prepare(pid, flags, &pidfile); + /* + * Note that no task has been attached to @pid yet indicate + * that via CLONE_PIDFD. + */ + retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; diff --git a/kernel/futex/core.c b/kernel/futex/core.c index ebdd76b4ecbb..cca15859a50b 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -50,10 +50,10 @@ */ static struct { struct futex_hash_bucket *queues; - unsigned long hashsize; + unsigned long hashmask; } __futex_data __read_mostly __aligned(2*sizeof(long)); #define futex_queues (__futex_data.queues) -#define futex_hashsize (__futex_data.hashsize) +#define futex_hashmask (__futex_data.hashmask) /* @@ -119,7 +119,7 @@ struct futex_hash_bucket *futex_hash(union futex_key *key) u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); - return &futex_queues[hash & (futex_hashsize - 1)]; + return &futex_queues[hash & futex_hashmask]; } @@ -532,7 +532,8 @@ void futex_q_unlock(struct futex_hash_bucket *hb) futex_hb_waiters_dec(hb); } -void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task) { int prio; @@ -548,7 +549,7 @@ void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); - q->task = current; + q->task = task; } /** @@ -1126,27 +1127,28 @@ void futex_exit_release(struct task_struct *tsk) static int __init futex_init(void) { + unsigned long hashsize, i; unsigned int futex_shift; - unsigned long i; #ifdef CONFIG_BASE_SMALL - futex_hashsize = 16; + hashsize = 16; #else - futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); + hashsize = roundup_pow_of_two(256 * num_possible_cpus()); #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, 0, + hashsize, 0, 0, &futex_shift, NULL, - futex_hashsize, futex_hashsize); - futex_hashsize = 1UL << futex_shift; + hashsize, hashsize); + hashsize = 1UL << futex_shift; - for (i = 0; i < futex_hashsize; i++) { + for (i = 0; i < hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } + futex_hashmask = hashsize - 1; return 0; } core_initcall(futex_init); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 99b32e728c4a..6b2f4c7eb720 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -285,13 +285,15 @@ static inline int futex_get_value_locked(u32 *dest, u32 __user *from) } extern void __futex_unqueue(struct futex_q *q); -extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); +extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task); extern int futex_unqueue(struct futex_q *q); /** * futex_queue() - Enqueue the futex_q on the futex_hash_bucket * @q: The futex_q to enqueue * @hb: The destination hash bucket + * @task: Task queueing this futex * * The hb->lock must be held by the caller, and is released here. 
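
The futex change above stores the precomputed mask (hashsize - 1) in futex_hashmask, so futex_hash() indexes the bucket array with a single AND and no subtraction on the fast path; this relies on the table size being a power of two. A short sketch of that indexing invariant (illustrative, not the kernel's jhash2-based setup):

#include <assert.h>
#include <stddef.h>

static size_t table_mask;               /* size - 1, with size a power of two */

static void table_init(size_t size)
{
        assert(size && (size & (size - 1)) == 0);  /* must be a power of two */
        table_mask = size - 1;
}

static size_t bucket_index(unsigned int hash)
{
        return hash & table_mask;       /* equivalent to hash % size */
}
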
A call to * futex_queue() is typically paired with exactly one call to futex_unqueue(). The @@ -299,11 +301,14 @@ extern int futex_unqueue(struct futex_q *q); * or nothing if the unqueue is done as part of the wake process and the unqueue * state is implicit in the state of woken task (see futex_wait_requeue_pi() for * an example). + * + * Note that @task may be NULL, for async usage of futexes. */ -static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb, + struct task_struct *task) __releases(&hb->lock) { - __futex_queue(q, hb); + __futex_queue(q, hb, task); spin_unlock(&hb->lock); } diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index d62cca5ed8f4..7a941845f7ee 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -982,7 +982,7 @@ retry_private: /* * Only actually queue now that the atomic ops are done: */ - __futex_queue(&q, hb); + __futex_queue(&q, hb, current); if (trylock) { ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); @@ -1020,10 +1020,7 @@ retry_private: * it sees the futex_q::pi_state. */ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); - preempt_disable(); - raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); if (ret) { if (ret == 1) diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 3a10375d9521..25877d4f2f8f 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -210,13 +210,12 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { - char comm[sizeof(current->comm)]; /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", - get_task_comm(comm, current), oparg); + current->comm, oparg); oparg &= 31; } oparg = 1 << oparg; @@ -350,7 +349,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - futex_queue(q, hb); + futex_queue(q, hb, current); /* Arm the timer */ if (timeout) @@ -461,7 +460,7 @@ retry: * next futex. Queue each futex at this moment so hb can * be unlocked. */ - futex_queue(q, hb); + futex_queue(q, hb, current); continue; } diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index 7670a811a565..8b888a6193cc 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -264,10 +264,10 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) /** * gcov_info_add - add up profiling data - * @dest: profiling data set to which data is added - * @source: profiling data set which is added + * @dst: profiling data set to which data is added + * @src: profiling data set which is added * - * Adds profiling counts of @source to @dest. + * Adds profiling counts of @src to @dst. 
*/ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) { diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 7e1340da5aca..00529c81cc40 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -7,20 +7,13 @@ set -e sfile="$(readlink -f "$0")" outdir="$(pwd)" tarfile=$1 -cpio_dir=$outdir/${tarfile%/*}/.tmp_cpio_dir +tmpdir=$outdir/${tarfile%/*}/.tmp_dir dir_list=" include/ arch/$SRCARCH/include/ " -if ! command -v cpio >/dev/null; then - echo >&2 "***" - echo >&2 "*** 'cpio' could not be found." - echo >&2 "***" - exit 1 -fi - # Support incremental builds by skipping archive generation # if timestamps of files being archived are not changed. @@ -48,9 +41,9 @@ all_dirs="$all_dirs $dir_list" # check include/generated/autoconf.h explicitly. # # Ignore them for md5 calculation to avoid pointless regeneration. -headers_md5="$(find $all_dirs -name "*.h" | - grep -v "include/generated/utsversion.h" | - grep -v "include/generated/autoconf.h" | +headers_md5="$(find $all_dirs -name "*.h" -a \ + ! -path include/generated/utsversion.h -a \ + ! -path include/generated/autoconf.h | xargs ls -l | md5sum | cut -d ' ' -f1)" # Any changes to this script will also cause a rebuild of the archive. @@ -65,36 +58,43 @@ fi echo " GEN $tarfile" -rm -rf $cpio_dir -mkdir $cpio_dir +rm -rf "${tmpdir}" +mkdir "${tmpdir}" if [ "$building_out_of_srctree" ]; then ( cd $srctree for f in $dir_list do find "$f" -name "*.h"; - done | cpio --quiet -pd $cpio_dir + done | tar -c -f - -T - | tar -xf - -C "${tmpdir}" ) fi -# The second CPIO can complain if files already exist which can happen with out -# of tree builds having stale headers in srctree. Just silence CPIO for now. for f in $dir_list; do find "$f" -name "*.h"; -done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1 +done | tar -c -f - -T - | tar -xf - -C "${tmpdir}" + +# Always exclude include/generated/utsversion.h +# Otherwise, the contents of the tarball may vary depending on the build steps. +rm -f "${tmpdir}/include/generated/utsversion.h" # Remove comments except SDPX lines -find $cpio_dir -type f -print0 | - xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' +# Use a temporary file to store directory contents to prevent find/xargs from +# seeing temporary files created by perl. +find "${tmpdir}" -type f -print0 > "${tmpdir}.contents.txt" +xargs -0 -P8 -n1 \ + perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' \ + < "${tmpdir}.contents.txt" +rm -f "${tmpdir}.contents.txt" # Create archive and try to normalize metadata for reproducibility. tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ --exclude=".__afs*" --exclude=".nfs*" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ - -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null + -I $XZ -cf $tarfile -C "${tmpdir}/" . 
> /dev/null echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 -rm -rf $cpio_dir +rm -rf "${tmpdir}" diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c18717189f32..04efa7a6e69b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -147,6 +147,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); + if (t->flags & PF_POSTCOREDUMP) + pr_err(" Blocked by coredump.\n"); pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); @@ -272,7 +274,7 @@ static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int writ * and hung_task_check_interval_secs */ static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); -static struct ctl_table hung_task_sysctls[] = { +static const struct ctl_table hung_task_sysctls[] = { #ifdef CONFIG_SMP { .procname = "hung_task_all_cpu_backtrace", diff --git a/kernel/iomem.c b/kernel/iomem.c index dc2120776e1c..75e61c1c6bc0 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -6,7 +6,8 @@ #include <linux/ioremap.h> #ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +static void *arch_memremap_wb(resource_size_t offset, unsigned long size, + unsigned long flags) { #ifdef ioremap_cache return (__force void *)ioremap_cache(offset, size); @@ -91,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size, flags); if (!addr) - addr = arch_memremap_wb(offset, size); + addr = arch_memremap_wb(offset, size, flags); } /* diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 529adb1f5859..875f25ed6f71 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -141,6 +141,12 @@ config GENERIC_IRQ_DEBUGFS If you don't know what to do here, say N. +# Clear forwarded VM interrupts during kexec. +# This option ensures the kernel clears active states for interrupts +# forwarded to virtual machines (VMs) during a machine kexec. 
+config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD + bool + endmenu config GENERIC_IRQ_MULTI_HANDLER diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index f19d3080bf11..c0f44c06d69d 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o +obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o kexec.o obj-$(CONFIG_IRQ_TIMINGS) += timings.o ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y) CFLAGS_timings.o += -DDEBUG diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 271e9139de77..0ff987d3a799 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -232,6 +232,21 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, } #endif +static void irq_enable(struct irq_desc *desc) +{ + if (!irqd_irq_disabled(&desc->irq_data)) { + unmask_irq(desc); + } else { + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) { + desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_state_clr_masked(desc); + } else { + unmask_irq(desc); + } + } +} + static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); @@ -332,21 +347,6 @@ void irq_shutdown_and_deactivate(struct irq_desc *desc) irq_domain_deactivate_irq(&desc->irq_data); } -void irq_enable(struct irq_desc *desc) -{ - if (!irqd_irq_disabled(&desc->irq_data)) { - unmask_irq(desc); - } else { - irq_state_clr_disabled(desc); - if (desc->irq_data.chip->irq_enable) { - desc->irq_data.chip->irq_enable(&desc->irq_data); - irq_state_clr_masked(desc); - } else { - unmask_irq(desc); - } - } -} - static void __irq_disable(struct irq_desc *desc, bool mask) { if (irqd_irq_disabled(&desc->irq_data)) { @@ -1114,13 +1114,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) trigger = irqd_get_trigger_type(&desc->irq_data); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); + IRQD_TRIGGER_MASK | IRQD_LEVEL); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); - if (irq_settings_can_move_pcntxt(desc)) - irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c6ffb97966be..ca142b9a4db3 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -53,6 +53,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND), BIT_MASK_DESCR(IRQCHIP_IMMUTABLE), + BIT_MASK_DESCR(IRQCHIP_MOVE_DEFERRED), }; static void @@ -108,7 +109,6 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_NO_BALANCING), BIT_MASK_DESCR(IRQD_SINGLE_TARGET), - BIT_MASK_DESCR(IRQD_MOVE_PCNTXT), BIT_MASK_DESCR(IRQD_AFFINITY_SET), BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 32ffcbb87fa1..c4a8bca5f2b0 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -162,6 +162,7 @@ void irq_gc_mask_disable_and_ack_set(struct irq_data *d) irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set); /** * irq_gc_eoi - EOI interrupt diff --git a/kernel/irq/internals.h 
b/kernel/irq/internals.h index fe0272cd84a5..b0290849c395 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -90,7 +90,6 @@ extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); extern void irq_shutdown_and_deactivate(struct irq_desc *desc); -extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); @@ -98,18 +97,12 @@ extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); extern void unmask_threaded_irq(struct irq_desc *desc); -extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask); - #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } #else extern void irq_mark_irq(unsigned int irq); #endif -extern int __irq_get_irqchip_state(struct irq_data *data, - enum irqchip_irq_state which, - bool *state); - irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -139,8 +132,6 @@ static inline void unregister_handler_proc(unsigned int irq, extern bool irq_can_set_affinity_usr(unsigned int irq); -extern void irq_set_thread_affinity(struct irq_desc *desc); - extern int irq_do_set_affinity(struct irq_data *data, const struct cpumask *dest, bool force); @@ -421,7 +412,7 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, #ifdef CONFIG_GENERIC_PENDING_IRQ static inline bool irq_can_move_pcntxt(struct irq_data *data) { - return irqd_can_move_in_process_context(data); + return !(data->chip->flags & IRQCHIP_MOVE_DEFERRED); } static inline bool irq_move_pending(struct irq_data *data) { @@ -441,11 +432,8 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return desc->pending_mask; } -static inline bool handle_enforce_irqctx(struct irq_data *data) -{ - return irqd_is_handle_enforce_irqctx(data); -} bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); +void irq_force_complete_move(struct irq_desc *desc); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) { @@ -471,10 +459,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } -static inline bool handle_enforce_irqctx(struct irq_data *data) -{ - return false; -} +static inline void irq_force_complete_move(struct irq_desc *desc) { } #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0253e77fcd9a..4258cd6bd3b4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -708,7 +708,7 @@ int handle_irq_desc(struct irq_desc *desc) return -EINVAL; data = irq_desc_get_irq_data(desc); - if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data))) + if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); @@ -991,7 +991,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) return desc && desc->kstat_irqs ? 
per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } -unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) +static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index ec6d8e72d980..2861f89880af 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1589,9 +1589,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, } } -int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, + unsigned int nr_irqs, void *arg) { if (!domain->ops->alloc) { pr_debug("domain->ops->alloc() is NULL\n"); diff --git a/kernel/irq/kexec.c b/kernel/irq/kexec.c new file mode 100644 index 000000000000..1a3deffe6b5b --- /dev/null +++ b/kernel/irq/kexec.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/irqnr.h> + +#include "internals.h" + +void machine_kexec_mask_interrupts(void) +{ + struct irq_desc *desc; + unsigned int i; + + for_each_irq_desc(i, desc) { + struct irq_chip *chip; + int check_eoi = 1; + + chip = irq_desc_get_chip(desc); + if (!chip || !irqd_is_started(&desc->irq_data)) + continue; + + if (IS_ENABLED(CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD)) { + /* + * First try to remove the active state from an interrupt which is forwarded + * to a VM. If the interrupt is not forwarded, try to EOI the interrupt. + */ + check_eoi = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); + } + + if (check_eoi && chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) + chip->irq_eoi(&desc->irq_data); + + irq_shutdown(desc); + } +} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f0803d6bd296..753eef8e041c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -35,6 +35,8 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif +static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state); + static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { struct irq_data *irqd = irq_desc_get_irq_data(desc); @@ -187,7 +189,7 @@ bool irq_can_set_affinity_usr(unsigned int irq) * set_cpus_allowed_ptr() here as we hold desc->lock and this * code can be called from hard interrupt context. */ -void irq_set_thread_affinity(struct irq_desc *desc) +static void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action; @@ -1182,45 +1184,38 @@ out_unlock: } /* - * Interrupts which are not explicitly requested as threaded - * interrupts rely on the implicit bh/preempt disable of the hard irq - * context. So we need to disable bh here to avoid deadlocks and other - * side effects. + * Interrupts explicitly requested as threaded interrupts want to be + * preemptible - many of them need to sleep and wait for slow busses to + * complete. 
*/ -static irqreturn_t -irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action) { - irqreturn_t ret; + irqreturn_t ret = action->thread_fn(action->irq, action->dev_id); - local_bh_disable(); - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_disable(); - ret = action->thread_fn(action->irq, action->dev_id); if (ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); irq_finalize_oneshot(desc, action); - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_enable(); - local_bh_enable(); return ret; } /* - * Interrupts explicitly requested as threaded interrupts want to be - * preemptible - many of them need to sleep and wait for slow busses to - * complete. + * Interrupts which are not explicitly requested as threaded + * interrupts rely on the implicit bh/preempt disable of the hard irq + * context. So we need to disable bh here to avoid deadlocks and other + * side effects. */ -static irqreturn_t irq_thread_fn(struct irq_desc *desc, - struct irqaction *action) +static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret; - ret = action->thread_fn(action->irq, action->dev_id); - if (ret == IRQ_HANDLED) - atomic_inc(&desc->threads_handled); - - irq_finalize_oneshot(desc, action); + local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); + ret = irq_thread_fn(desc, action); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); + local_bh_enable(); return ret; } @@ -2796,8 +2791,7 @@ out: irq_put_desc_unlock(desc, flags); } -int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, - bool *state) +static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state) { struct irq_chip *chip; int err = -EINVAL; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index eb150afd671f..147cabb4c077 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) return true; } +void irq_force_complete_move(struct irq_desc *desc) +{ + for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) { + if (d->chip && d->chip->irq_force_complete_move) { + d->chip->irq_force_complete_move(d); + return; + } + } +} + void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); @@ -117,3 +127,13 @@ void __irq_move_irq(struct irq_data *idata) if (!masked) idata->chip->irq_unmask(idata); } + +bool irq_can_move_in_process_context(struct irq_data *data) +{ + /* + * Get the top level irq_data in the hierarchy, which is optimized + * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled. 
+ */ + data = irq_desc_get_irq_data(irq_data_to_desc(data)); + return irq_can_move_pcntxt(data); +} diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 396a067a8a56..1951a08f0421 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -15,6 +15,7 @@ #include <linux/mutex.h> #include <linux/pci.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include <linux/sysfs.h> #include <linux/types.h> #include <linux/xarray.h> @@ -269,16 +270,11 @@ fail: return ret; } -void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) -{ - *msg = entry->msg; -} - void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) { struct msi_desc *entry = irq_get_msi_desc(irq); - __get_cached_msi_msg(entry, msg); + *msg = entry->msg; } EXPORT_SYMBOL_GPL(get_cached_msi_msg); @@ -342,26 +338,30 @@ int msi_setup_device_data(struct device *dev) } /** - * msi_lock_descs - Lock the MSI descriptor storage of a device + * __msi_lock_descs - Lock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_lock_descs(struct device *dev) +void __msi_lock_descs(struct device *dev) { mutex_lock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_lock_descs); +EXPORT_SYMBOL_GPL(__msi_lock_descs); /** - * msi_unlock_descs - Unlock the MSI descriptor storage of a device + * __msi_unlock_descs - Unlock the MSI descriptor storage of a device * @dev: Device to operate on + * + * Internal function for guard(msi_descs_lock). Don't use in code. */ -void msi_unlock_descs(struct device *dev) +void __msi_unlock_descs(struct device *dev) { /* Invalidate the index which was cached by the iterator */ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } -EXPORT_SYMBOL_GPL(msi_unlock_descs); +EXPORT_SYMBOL_GPL(__msi_unlock_descs); static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, enum msi_desc_filter filter) @@ -447,7 +447,6 @@ EXPORT_SYMBOL_GPL(msi_next_desc); unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; - unsigned int ret = 0; bool pcimsi = false; struct xarray *xa; @@ -461,7 +460,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) pcimsi = to_pci_dev(dev)->msi_enabled; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); xa = &dev->msi.data->__domains[domid].store; desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { @@ -470,16 +469,12 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne * PCI-MSIX and platform MSI use a descriptor per * interrupt. 
*/ - if (pcimsi) { - if (index < desc->nvec_used) - ret = desc->irq + index; - } else { - ret = desc->irq; - } + if (!pcimsi) + return desc->irq; + if (index < desc->nvec_used) + return desc->irq + index; } - - msi_unlock_descs(dev); - return ret; + return 0; } EXPORT_SYMBOL_GPL(msi_domain_get_virq); @@ -756,12 +751,30 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw return info->ops->msi_translate(domain, fwspec, hwirq, type); } +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS +static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d, + struct irq_data *irqd, int ind) +{ + struct msi_desc *desc = irq_data_get_msi_desc(irqd); + + if (!desc) + return; + + seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi); + seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo); + seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data); +} +#endif + static const struct irq_domain_ops msi_domain_ops = { .alloc = msi_domain_alloc, .free = msi_domain_free, .activate = msi_domain_activate, .deactivate = msi_domain_deactivate, .translate = msi_domain_translate, +#ifdef CONFIG_GENERIC_IRQ_DEBUGFS + .debug_show = msi_domain_debug_show, +#endif }; static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, @@ -979,9 +992,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, void *chip_data) { struct irq_domain *domain, *parent = dev->msi.domain; - struct fwnode_handle *fwnode, *fwnalloced = NULL; - struct msi_domain_template *bundle; const struct msi_parent_ops *pops; + struct fwnode_handle *fwnode; if (!irq_domain_is_msi_parent(parent)) return false; @@ -989,7 +1001,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) return false; - bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + struct msi_domain_template *bundle __free(kfree) = + bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); if (!bundle) return false; @@ -1012,41 +1025,36 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, * node as they are not guaranteed to have a fwnode. They are never * looked up and always handled in the context of the device. */ - if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE) - fwnode = dev->fwnode; + struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL; + + if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)) + fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name); else - fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name); + fwnode = dev->fwnode; if (!fwnode) - goto free_bundle; + return false; if (msi_setup_device_data(dev)) - goto free_fwnode; - - msi_lock_descs(dev); + return false; + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) - goto fail; + return false; if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) - goto fail; + return false; domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); if (!domain) - goto fail; + return false; + /* @bundle and @fwnode_alloced are now in use. 
Prevent cleanup */ + retain_ptr(bundle); + retain_ptr(fwnode_alloced); domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; - msi_unlock_descs(dev); return true; - -fail: - msi_unlock_descs(dev); -free_fwnode: - irq_domain_free_fwnode(fwnalloced); -free_bundle: - kfree(bundle); - return false; } /** @@ -1060,12 +1068,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) struct msi_domain_info *info; struct irq_domain *domain; - msi_lock_descs(dev); - + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); - if (!domain || !irq_domain_is_msi_device(domain)) - goto unlock; + return; dev->msi.data->__domains[domid].domain = NULL; info = domain->host_data; @@ -1074,9 +1080,6 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) irq_domain_remove(domain); irq_domain_free_fwnode(fwnode); kfree(container_of(info, struct msi_domain_template, info)); - -unlock: - msi_unlock_descs(dev); } /** @@ -1092,16 +1095,14 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, { struct msi_domain_info *info; struct irq_domain *domain; - bool ret = false; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); domain = msi_get_device_domain(dev, domid); if (domain && irq_domain_is_msi_device(domain)) { info = domain->host_data; - ret = info->bus_token == bus_token; + return info->bus_token == bus_token; } - msi_unlock_descs(dev); - return ret; + return false; } static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -1143,7 +1144,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) return false; - if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + if (info->flags & MSI_FLAG_NO_MASK) return false; /* @@ -1333,21 +1334,17 @@ static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl) } /** - * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain + * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain * @dev: Pointer to device struct of the device for which the interrupts * are allocated * @domid: Id of the interrupt domain to operate on * @first: First index to allocate (inclusive) * @last: Last index to allocate (inclusive) * - * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() - * pair. Use this for MSI irqdomains which implement their own descriptor - * allocation/free. - * * Return: %0 on success or an error code. */ -int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) +int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { struct msi_ctrl ctrl = { .domid = domid, @@ -1356,29 +1353,9 @@ int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, .nirqs = last + 1 - first, }; + guard(msi_descs_lock)(dev); return msi_domain_alloc_locked(dev, &ctrl); } - -/** - * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain - * @dev: Pointer to device struct of the device for which the interrupts - * are allocated - * @domid: Id of the interrupt domain to operate on - * @first: First index to allocate (inclusive) - * @last: Last index to allocate (inclusive) - * - * Return: %0 on success or an error code. 
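
In msi_create_device_irq_domain() above, the bundle and the allocated fwnode are tied to __free() cleanups, so every early return frees them automatically and only the success path keeps them. A hedged userspace analogue of "auto-free unless explicitly retained", again using the GCC/Clang cleanup attribute and a statement-expression macro (illustrative names, not the kernel's retain helper):

#include <stdlib.h>
#include <string.h>

static void free_ptr(void *p)
{
        free(*(void **)p);              /* no-op once the pointer is NULLed */
}

/* Hand the pointer to the caller and disarm the scoped auto-free. */
#define RETAIN(p) ({ __typeof__(p) __r = (p); (p) = NULL; __r; })

static char *build_name(const char *base)
{
        char *name __attribute__((cleanup(free_ptr))) = strdup(base);

        if (!name || name[0] == '\0')
                return NULL;            /* auto-free runs if name was allocated */

        return RETAIN(name);            /* success: ownership moves to caller */
}
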
- */ -int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) -{ - int ret; - - msi_lock_descs(dev); - ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); - return ret; -} EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); /** @@ -1481,12 +1458,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u const struct irq_affinity_desc *affdesc, union msi_instance_cookie *icookie) { - struct msi_map map; - - msi_lock_descs(dev); - map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); - msi_unlock_descs(dev); - return map; + guard(msi_descs_lock)(dev); + return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie); } /** @@ -1523,13 +1496,11 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq, icookie.value = ((u64)type << 32) | hwirq; - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain)) map.index = -EINVAL; else map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie); - msi_unlock_descs(dev); - return map.index >= 0 ? map.virq : map.index; } @@ -1599,8 +1570,8 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) * @first: First index to free (inclusive) * @last: Last index to free (inclusive) */ -void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, - unsigned int first, unsigned int last) +static void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { struct msi_ctrl ctrl = { .domid = domid, @@ -1622,9 +1593,8 @@ void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, unsigned int first, unsigned int last) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_range_locked(dev, domid, first, last); - msi_unlock_descs(dev); } EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); @@ -1654,9 +1624,8 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) */ void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { - msi_lock_descs(dev); + guard(msi_descs_lock)(dev); msi_domain_free_irqs_all_locked(dev, domid); - msi_unlock_descs(dev); } /** @@ -1675,12 +1644,11 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq) if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI)) return; - msi_lock_descs(dev); - if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) { - msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, - desc->msi_index); - } - msi_unlock_descs(dev); + guard(msi_descs_lock)(dev); + if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) + return; + msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index, + desc->msi_index); } /** diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index b07a2d732ffb..1b7fa72968bd 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -53,7 +53,7 @@ static int irq_sw_resend(struct irq_desc *desc) * Validate whether this interrupt can be safely injected from * non interrupt context */ - if (handle_enforce_irqctx(&desc->irq_data)) + if (irqd_is_handle_enforce_irqctx(&desc->irq_data)) return -EINVAL; /* diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 7b7efb1a114b..00b3bd127692 
100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -11,7 +11,6 @@ enum { _IRQ_NOREQUEST = IRQ_NOREQUEST, _IRQ_NOTHREAD = IRQ_NOTHREAD, _IRQ_NOAUTOEN = IRQ_NOAUTOEN, - _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, @@ -142,11 +141,6 @@ static inline void irq_settings_set_noprobe(struct irq_desc *desc) desc->status_use_accessors |= _IRQ_NOPROBE; } -static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) -{ - return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; -} - static inline bool irq_settings_can_autoenable(struct irq_desc *desc) { return !(desc->status_use_accessors & _IRQ_NOAUTOEN); diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index c43e2ac2f8de..4b7315e99bd6 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -509,6 +509,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) /** * irq_timings_next_event - Return when the next event is supposed to arrive + * @now: current time * * During the last busy cycle, the number of interrupts is incremented * and stored in the irq_timings structure. This information is diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 2f4fb336dda1..73f7e1fd4ab4 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -147,7 +147,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (!irq_work_claim(work)) return false; - kasan_record_aux_stack_noalloc(work); + kasan_record_aux_stack(work); preempt_disable(); if (cpu != smp_processor_id()) { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a9a0ca605d4a..4198f30aac3c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -148,16 +148,8 @@ static unsigned int get_symbol_offset(unsigned long pos) unsigned long kallsyms_sym_address(int idx) { - /* values are unsigned offsets if --absolute-percpu is not in effect */ - if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) - return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; - - /* ...otherwise, positive offsets are absolute values */ - if (kallsyms_offsets[idx] >= 0) - return kallsyms_offsets[idx]; - - /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ - return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; + /* values are unsigned offsets */ + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; } static unsigned int get_symbol_seq(int index) diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index 873f7c445488..cf4af5728307 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -435,13 +435,11 @@ static int __init kallsyms_test_init(void) { struct task_struct *t; - t = kthread_create(test_entry, NULL, "kallsyms_test"); + t = kthread_run_on_cpu(test_entry, NULL, 0, "kallsyms_test"); if (IS_ERR(t)) { pr_info("Create kallsyms selftest task failed\n"); return PTR_ERR(t); } - kthread_bind(t, 0); - wake_up_process(t); return 0; } diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 2c596851f8a9..7c1a65bd5f8d 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -145,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, */ task1 = find_task_by_vpid(pid1); task2 = find_task_by_vpid(pid2); - if (!task1 || !task2) + if (unlikely(!task1 || !task2)) goto err_no_task; get_task_struct(task1); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c0caa14880c3..c0bdc1686154 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -925,7 +925,7 @@ static int 
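
With the absolute-percpu special case gone, kallsyms_sym_address() above is a single formula: every entry is an unsigned 32-bit offset added to kallsyms_relative_base. A tiny sketch of that decode step, with made-up values standing in for the link-time generated tables:

#include <stdint.h>

/* Illustrative values; the real tables are generated at build time. */
static const uint64_t relative_base = 0xffffffff80000000ULL;
static const int32_t offsets[] = { 0x1000, 0x2040, 0x31c0 };

static uint64_t sym_address(int idx)
{
        /* Offsets are unsigned now; the (u32) cast mirrors the kernel decode. */
        return relative_base + (uint32_t)offsets[idx];
}
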
kexec_limit_handler(const struct ctl_table *table, int write, return proc_dointvec(&tmp, write, buffer, lenp, ppos); } -static struct ctl_table kexec_core_sysctls[] = { +static const struct ctl_table kexec_core_sysctls[] = { { .procname = "kexec_load_disabled", .data = &kexec_load_disabled, @@ -1001,6 +1001,12 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + /* + * This flow is analogous to hibernation flows that occur + * before creating an image and before jumping from the + * restore kernel to the image one, so it uses the same + * device callbacks as those two flows. + */ pm_prepare_console(); error = freeze_processes(); if (error) { @@ -1011,12 +1017,10 @@ int kernel_kexec(void) error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; - /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_end(). We *must* call - * dpm_suspend_end() now. Otherwise, drivers for - * some devices (e.g. interrupt controllers) become - * desynchronized with the actual state of the - * hardware at resume time, and evil weirdness ensues. + /* + * dpm_suspend_end() must be called after dpm_suspend_start() + * to complete the transition, like in the hibernation flows + * mentioned above. */ error = dpm_suspend_end(PMSG_FREEZE); if (error) @@ -1052,6 +1056,13 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + /* + * This flow is analogous to hibernation flows that occur after + * creating an image and after the image kernel has got control + * back, and in case the devices have been reset or otherwise + * manipulated in the meantime, it uses the device callbacks + * used by the latter. + */ syscore_resume(); Enable_irqs: local_irq_enable(); diff --git a/kernel/kheaders.c b/kernel/kheaders.c index 42163c9e94e5..378088b07f46 100644 --- a/kernel/kheaders.c +++ b/kernel/kheaders.c @@ -29,25 +29,12 @@ asm ( extern char kernel_headers_data[]; extern char kernel_headers_data_end[]; -static ssize_t -ikheaders_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t len) -{ - memcpy(buf, &kernel_headers_data[off], len); - return len; -} - -static struct bin_attribute kheaders_attr __ro_after_init = { - .attr = { - .name = "kheaders.tar.xz", - .mode = 0444, - }, - .read = &ikheaders_read, -}; +static struct bin_attribute kheaders_attr __ro_after_init = + __BIN_ATTR_SIMPLE_RO(kheaders.tar.xz, 0444); static int __init ikheaders_init(void) { + kheaders_attr.private = kernel_headers_data; kheaders_attr.size = (kernel_headers_data_end - kernel_headers_data); return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b027a4030976..88aeac84e4c0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -39,6 +39,7 @@ #include <linux/static_call.h> #include <linux/perf_event.h> #include <linux/execmem.h> +#include <linux/cleanup.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -140,45 +141,39 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c); kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; - kprobe_opcode_t *slot = NULL; /* Since the slot array is not protected by rcu, we need a mutex */ - mutex_lock(&c->mutex); - retry: - rcu_read_lock(); - list_for_each_entry_rcu(kip, &c->pages, list) { - if (kip->nused < slots_per_page(c)) { - int i; - - for (i = 0; i < slots_per_page(c); i++) { - if (kip->slot_used[i] == SLOT_CLEAN) { - 
kip->slot_used[i] = SLOT_USED; - kip->nused++; - slot = kip->insns + (i * c->insn_size); - rcu_read_unlock(); - goto out; + guard(mutex)(&c->mutex); + do { + guard(rcu)(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if (kip->nused < slots_per_page(c)) { + int i; + + for (i = 0; i < slots_per_page(c); i++) { + if (kip->slot_used[i] == SLOT_CLEAN) { + kip->slot_used[i] = SLOT_USED; + kip->nused++; + return kip->insns + (i * c->insn_size); + } } + /* kip->nused is broken. Fix it. */ + kip->nused = slots_per_page(c); + WARN_ON(1); } - /* kip->nused is broken. Fix it. */ - kip->nused = slots_per_page(c); - WARN_ON(1); } - } - rcu_read_unlock(); - /* If there are any garbage slots, collect it and try again. */ - if (c->nr_garbage && collect_garbage_slots(c) == 0) - goto retry; + } while (c->nr_garbage && collect_garbage_slots(c) == 0); /* All out of space. Need to allocate a new page. */ kip = kmalloc(struct_size(kip, slot_used, slots_per_page(c)), GFP_KERNEL); if (!kip) - goto out; + return NULL; kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); - goto out; + return NULL; } INIT_LIST_HEAD(&kip->list); memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); @@ -187,14 +182,12 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->ngarbage = 0; kip->cache = c; list_add_rcu(&kip->list, &c->pages); - slot = kip->insns; /* Record the perf ksymbol register event after adding the page */ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, PAGE_SIZE, false, c->sym); -out: - mutex_unlock(&c->mutex); - return slot; + + return kip->insns; } /* Return true if all garbages are collected, otherwise false. */ @@ -249,25 +242,35 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -void __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +static long __find_insn_page(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, struct kprobe_insn_page **pkip) { - struct kprobe_insn_page *kip; + struct kprobe_insn_page *kip = NULL; long idx; - mutex_lock(&c->mutex); - rcu_read_lock(); + guard(rcu)(); list_for_each_entry_rcu(kip, &c->pages, list) { idx = ((long)slot - (long)kip->insns) / (c->insn_size * sizeof(kprobe_opcode_t)); - if (idx >= 0 && idx < slots_per_page(c)) - goto out; + if (idx >= 0 && idx < slots_per_page(c)) { + *pkip = kip; + return idx; + } } /* Could not find this slot. */ WARN_ON(1); - kip = NULL; -out: - rcu_read_unlock(); + *pkip = NULL; + return -1; +} + +void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) +{ + struct kprobe_insn_page *kip = NULL; + long idx; + + guard(mutex)(&c->mutex); + idx = __find_insn_page(c, slot, &kip); /* Mark and sweep: this may sleep */ if (kip) { /* Check double free */ @@ -281,7 +284,6 @@ out: collect_one_slot(kip, idx); } } - mutex_unlock(&c->mutex); } /* @@ -600,47 +602,43 @@ static void kick_kprobe_optimizer(void) /* Kprobe jump optimizer */ static void kprobe_optimizer(struct work_struct *work) { - mutex_lock(&kprobe_mutex); - cpus_read_lock(); - mutex_lock(&text_mutex); + guard(mutex)(&kprobe_mutex); - /* - * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) - * kprobes before waiting for quiesence period. - */ - do_unoptimize_kprobes(); + scoped_guard(cpus_read_lock) { + guard(mutex)(&text_mutex); - /* - * Step 2: Wait for quiesence period to ensure all potentially - * preempted tasks to have normally scheduled. 
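[Editor's aside] The conversions in the MSI and kprobes hunks above all lean on the scope-based cleanup helpers from <linux/cleanup.h>: guard() holds a lock until the enclosing function returns, while scoped_guard() limits the critical section to the braced block, which is why the rewritten error paths can simply return instead of jumping to an unlock label. A minimal sketch of the pattern follows; the demo_lock/demo_state names and both functions are hypothetical, not part of this patch.

	#include <linux/cleanup.h>
	#include <linux/errno.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(demo_lock);
	static int demo_state;

	/* guard(): demo_lock is dropped automatically on every return path. */
	static int demo_set(int val)
	{
		guard(mutex)(&demo_lock);

		if (val < 0)
			return -EINVAL;		/* unlocked here as well */

		demo_state = val;
		return 0;
	}

	/* scoped_guard(): the critical section is only the statement below. */
	static int demo_get(void)
	{
		int val;

		scoped_guard(mutex, &demo_lock)
			val = demo_state;

		return val;
	}
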
Because optprobe - * may modify multiple instructions, there is a chance that Nth - * instruction is preempted. In that case, such tasks can return - * to 2nd-Nth byte of jump instruction. This wait is for avoiding it. - * Note that on non-preemptive kernel, this is transparently converted - * to synchronoze_sched() to wait for all interrupts to have completed. - */ - synchronize_rcu_tasks(); + /* + * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) + * kprobes before waiting for quiesence period. + */ + do_unoptimize_kprobes(); - /* Step 3: Optimize kprobes after quiesence period */ - do_optimize_kprobes(); + /* + * Step 2: Wait for quiesence period to ensure all potentially + * preempted tasks to have normally scheduled. Because optprobe + * may modify multiple instructions, there is a chance that Nth + * instruction is preempted. In that case, such tasks can return + * to 2nd-Nth byte of jump instruction. This wait is for avoiding it. + * Note that on non-preemptive kernel, this is transparently converted + * to synchronoze_sched() to wait for all interrupts to have completed. + */ + synchronize_rcu_tasks(); - /* Step 4: Free cleaned kprobes after quiesence period */ - do_free_cleaned_kprobes(); + /* Step 3: Optimize kprobes after quiesence period */ + do_optimize_kprobes(); - mutex_unlock(&text_mutex); - cpus_read_unlock(); + /* Step 4: Free cleaned kprobes after quiesence period */ + do_free_cleaned_kprobes(); + } /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) kick_kprobe_optimizer(); - - mutex_unlock(&kprobe_mutex); } -/* Wait for completing optimization and unoptimization */ -void wait_for_kprobe_optimizer(void) +static void wait_for_kprobe_optimizer_locked(void) { - mutex_lock(&kprobe_mutex); + lockdep_assert_held(&kprobe_mutex); while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { mutex_unlock(&kprobe_mutex); @@ -652,8 +650,14 @@ void wait_for_kprobe_optimizer(void) mutex_lock(&kprobe_mutex); } +} - mutex_unlock(&kprobe_mutex); +/* Wait for completing optimization and unoptimization */ +void wait_for_kprobe_optimizer(void) +{ + guard(mutex)(&kprobe_mutex); + + wait_for_kprobe_optimizer_locked(); } bool optprobe_queued_unopt(struct optimized_kprobe *op) @@ -852,29 +856,24 @@ static void try_to_optimize_kprobe(struct kprobe *p) return; /* For preparing optimization, jump_label_text_reserved() is called. */ - cpus_read_lock(); - jump_label_lock(); - mutex_lock(&text_mutex); + guard(cpus_read_lock)(); + guard(jump_label_lock)(); + guard(mutex)(&text_mutex); ap = alloc_aggr_kprobe(p); if (!ap) - goto out; + return; op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { /* If failed to setup optimizing, fallback to kprobe. */ arch_remove_optimized_kprobe(op); kfree(op); - goto out; + return; } init_aggr_kprobe(ap, p); optimize_kprobe(ap); /* This just kicks optimizer thread. */ - -out: - mutex_unlock(&text_mutex); - jump_label_unlock(); - cpus_read_unlock(); } static void optimize_all_kprobes(void) @@ -883,10 +882,10 @@ static void optimize_all_kprobes(void) struct kprobe *p; unsigned int i; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* If optimization is already allowed, just return. */ if (kprobes_allow_optimization) - goto out; + return; cpus_read_lock(); kprobes_allow_optimization = true; @@ -898,8 +897,6 @@ static void optimize_all_kprobes(void) } cpus_read_unlock(); pr_info("kprobe jump-optimization is enabled. 
All kprobes are optimized if possible.\n"); -out: - mutex_unlock(&kprobe_mutex); } #ifdef CONFIG_SYSCTL @@ -909,12 +906,10 @@ static void unoptimize_all_kprobes(void) struct kprobe *p; unsigned int i; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* If optimization is already prohibited, just return. */ - if (!kprobes_allow_optimization) { - mutex_unlock(&kprobe_mutex); + if (!kprobes_allow_optimization) return; - } cpus_read_lock(); kprobes_allow_optimization = false; @@ -926,10 +921,8 @@ static void unoptimize_all_kprobes(void) } } cpus_read_unlock(); - mutex_unlock(&kprobe_mutex); - /* Wait for unoptimizing completion. */ - wait_for_kprobe_optimizer(); + wait_for_kprobe_optimizer_locked(); pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n"); } @@ -941,7 +934,7 @@ static int proc_kprobes_optimization_handler(const struct ctl_table *table, { int ret; - mutex_lock(&kprobe_sysctl_mutex); + guard(mutex)(&kprobe_sysctl_mutex); sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -949,12 +942,11 @@ static int proc_kprobes_optimization_handler(const struct ctl_table *table, optimize_all_kprobes(); else unoptimize_all_kprobes(); - mutex_unlock(&kprobe_sysctl_mutex); return ret; } -static struct ctl_table kprobe_sysctls[] = { +static const struct ctl_table kprobe_sysctls[] = { { .procname = "kprobes-optimization", .data = &sysctl_kprobes_optimization, @@ -1024,7 +1016,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) #define __arm_kprobe(p) arch_arm_kprobe(p) #define __disarm_kprobe(p, o) arch_disarm_kprobe(p) #define kprobe_disarmed(p) kprobe_disabled(p) -#define wait_for_kprobe_optimizer() do {} while (0) +#define wait_for_kprobe_optimizer_locked() \ + lockdep_assert_held(&kprobe_mutex) static int reuse_unused_kprobe(struct kprobe *ap) { @@ -1078,20 +1071,18 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, if (*cnt == 0) { ret = register_ftrace_function(ops); - if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) - goto err_ftrace; + if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) { + /* + * At this point, since ops is not registered, we should be safe from + * registering empty filter. + */ + ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); + return ret; + } } (*cnt)++; return ret; - -err_ftrace: - /* - * At this point, since ops is not registered, we should be safe from - * registering empty filter. 
- */ - ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); - return ret; } static int arm_kprobe_ftrace(struct kprobe *p) @@ -1163,12 +1154,9 @@ static int arm_kprobe(struct kprobe *kp) if (unlikely(kprobe_ftrace(kp))) return arm_kprobe_ftrace(kp); - cpus_read_lock(); - mutex_lock(&text_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&text_mutex); __arm_kprobe(kp); - mutex_unlock(&text_mutex); - cpus_read_unlock(); - return 0; } @@ -1177,12 +1165,9 @@ static int disarm_kprobe(struct kprobe *kp, bool reopt) if (unlikely(kprobe_ftrace(kp))) return disarm_kprobe_ftrace(kp); - cpus_read_lock(); - mutex_lock(&text_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&text_mutex); __disarm_kprobe(kp, reopt); - mutex_unlock(&text_mutex); - cpus_read_unlock(); - return 0; } @@ -1299,62 +1284,55 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) int ret = 0; struct kprobe *ap = orig_p; - cpus_read_lock(); - - /* For preparing optimization, jump_label_text_reserved() is called */ - jump_label_lock(); - mutex_lock(&text_mutex); - - if (!kprobe_aggrprobe(orig_p)) { - /* If 'orig_p' is not an 'aggr_kprobe', create new one. */ - ap = alloc_aggr_kprobe(orig_p); - if (!ap) { - ret = -ENOMEM; - goto out; + scoped_guard(cpus_read_lock) { + /* For preparing optimization, jump_label_text_reserved() is called */ + guard(jump_label_lock)(); + guard(mutex)(&text_mutex); + + if (!kprobe_aggrprobe(orig_p)) { + /* If 'orig_p' is not an 'aggr_kprobe', create new one. */ + ap = alloc_aggr_kprobe(orig_p); + if (!ap) + return -ENOMEM; + init_aggr_kprobe(ap, orig_p); + } else if (kprobe_unused(ap)) { + /* This probe is going to die. Rescue it */ + ret = reuse_unused_kprobe(ap); + if (ret) + return ret; } - init_aggr_kprobe(ap, orig_p); - } else if (kprobe_unused(ap)) { - /* This probe is going to die. Rescue it */ - ret = reuse_unused_kprobe(ap); - if (ret) - goto out; - } - if (kprobe_gone(ap)) { - /* - * Attempting to insert new probe at the same location that - * had a probe in the module vaddr area which already - * freed. So, the instruction slot has already been - * released. We need a new slot for the new probe. - */ - ret = arch_prepare_kprobe(ap); - if (ret) + if (kprobe_gone(ap)) { /* - * Even if fail to allocate new slot, don't need to - * free the 'ap'. It will be used next time, or - * freed by unregister_kprobe(). + * Attempting to insert new probe at the same location that + * had a probe in the module vaddr area which already + * freed. So, the instruction slot has already been + * released. We need a new slot for the new probe. */ - goto out; - - /* Prepare optimized instructions if possible. */ - prepare_optimized_kprobe(ap); + ret = arch_prepare_kprobe(ap); + if (ret) + /* + * Even if fail to allocate new slot, don't need to + * free the 'ap'. It will be used next time, or + * freed by unregister_kprobe(). + */ + return ret; - /* - * Clear gone flag to prevent allocating new slot again, and - * set disabled flag because it is not armed yet. - */ - ap->flags = (ap->flags & ~KPROBE_FLAG_GONE) - | KPROBE_FLAG_DISABLED; - } + /* Prepare optimized instructions if possible. */ + prepare_optimized_kprobe(ap); - /* Copy the insn slot of 'p' to 'ap'. */ - copy_kprobe(ap, p); - ret = add_new_kprobe(ap, p); + /* + * Clear gone flag to prevent allocating new slot again, and + * set disabled flag because it is not armed yet. 
+ */ + ap->flags = (ap->flags & ~KPROBE_FLAG_GONE) + | KPROBE_FLAG_DISABLED; + } -out: - mutex_unlock(&text_mutex); - jump_label_unlock(); - cpus_read_unlock(); + /* Copy the insn slot of 'p' to 'ap'. */ + copy_kprobe(ap, p); + ret = add_new_kprobe(ap, p); + } if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; @@ -1448,7 +1426,7 @@ _kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name, unsigned long offset, bool *on_func_entry) { if ((symbol_name && addr) || (!symbol_name && !addr)) - goto invalid; + return ERR_PTR(-EINVAL); if (symbol_name) { /* @@ -1478,16 +1456,16 @@ _kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name, * at the start of the function. */ addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry); - if (addr) - return addr; + if (!addr) + return ERR_PTR(-EINVAL); -invalid: - return ERR_PTR(-EINVAL); + return addr; } static kprobe_opcode_t *kprobe_addr(struct kprobe *p) { bool on_func_entry; + return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); } @@ -1505,15 +1483,15 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p) if (unlikely(!ap)) return NULL; - if (p != ap) { - list_for_each_entry(list_p, &ap->list, list) - if (list_p == p) - /* kprobe p is a valid probe */ - goto valid; - return NULL; - } -valid: - return ap; + if (p == ap) + return ap; + + list_for_each_entry(list_p, &ap->list, list) + if (list_p == p) + /* kprobe p is a valid probe */ + return ap; + + return NULL; } /* @@ -1522,14 +1500,12 @@ valid: */ static inline int warn_kprobe_rereg(struct kprobe *p) { - int ret = 0; + guard(mutex)(&kprobe_mutex); - mutex_lock(&kprobe_mutex); if (WARN_ON_ONCE(__get_valid_kprobe(p))) - ret = -EINVAL; - mutex_unlock(&kprobe_mutex); + return -EINVAL; - return ret; + return 0; } static int check_ftrace_location(struct kprobe *p) @@ -1565,17 +1541,23 @@ static int check_kprobe_address_safe(struct kprobe *p, ret = check_ftrace_location(p); if (ret) return ret; - jump_label_lock(); - preempt_disable(); + + guard(jump_label_lock)(); /* Ensure the address is in a text area, and find a module if exists. */ *probed_mod = NULL; if (!core_kernel_text((unsigned long) p->addr)) { + guard(preempt)(); *probed_mod = __module_text_address((unsigned long) p->addr); - if (!(*probed_mod)) { - ret = -EINVAL; - goto out; - } + if (!(*probed_mod)) + return -EINVAL; + + /* + * We must hold a refcount of the probed module while updating + * its code to prohibit unexpected unloading. + */ + if (unlikely(!try_module_get(*probed_mod))) + return -ENOENT; } /* Ensure it is not in reserved area. */ if (in_gate_area_no_mm((unsigned long) p->addr) || @@ -1584,49 +1566,71 @@ static int check_kprobe_address_safe(struct kprobe *p, static_call_text_reserved(p->addr, p->addr) || find_bug((unsigned long)p->addr) || is_cfi_preamble_symbol((unsigned long)p->addr)) { - ret = -EINVAL; - goto out; + module_put(*probed_mod); + return -EINVAL; } /* Get module refcount and reject __init functions for loaded modules. */ if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { /* - * We must hold a refcount of the probed module while updating - * its code to prohibit unexpected unloading. - */ - if (unlikely(!try_module_get(*probed_mod))) { - ret = -ENOENT; - goto out; - } - - /* * If the module freed '.init.text', we couldn't insert * kprobes in there. 
*/ if (within_module_init((unsigned long)p->addr, *probed_mod) && !module_is_coming(*probed_mod)) { module_put(*probed_mod); - *probed_mod = NULL; - ret = -ENOENT; + return -ENOENT; } } -out: - preempt_enable(); - jump_label_unlock(); + return 0; +} - return ret; +static int __register_kprobe(struct kprobe *p) +{ + int ret; + struct kprobe *old_p; + + guard(mutex)(&kprobe_mutex); + + old_p = get_kprobe(p->addr); + if (old_p) + /* Since this may unoptimize 'old_p', locking 'text_mutex'. */ + return register_aggr_kprobe(old_p, p); + + scoped_guard(cpus_read_lock) { + /* Prevent text modification */ + guard(mutex)(&text_mutex); + ret = prepare_kprobe(p); + if (ret) + return ret; + } + + INIT_HLIST_NODE(&p->hlist); + hlist_add_head_rcu(&p->hlist, + &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + + if (!kprobes_all_disarmed && !kprobe_disabled(p)) { + ret = arm_kprobe(p); + if (ret) { + hlist_del_rcu(&p->hlist); + synchronize_rcu(); + } + } + + /* Try to optimize kprobe */ + try_to_optimize_kprobe(p); + return 0; } int register_kprobe(struct kprobe *p) { int ret; - struct kprobe *old_p; struct module *probed_mod; kprobe_opcode_t *addr; bool on_func_entry; - /* Adjust probe address from symbol */ + /* Canonicalize probe address from symbol */ addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry); if (IS_ERR(addr)) return PTR_ERR(addr); @@ -1638,6 +1642,8 @@ int register_kprobe(struct kprobe *p) /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; + if (on_func_entry) + p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY; p->nmissed = 0; INIT_LIST_HEAD(&p->list); @@ -1645,44 +1651,7 @@ int register_kprobe(struct kprobe *p) if (ret) return ret; - mutex_lock(&kprobe_mutex); - - if (on_func_entry) - p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY; - - old_p = get_kprobe(p->addr); - if (old_p) { - /* Since this may unoptimize 'old_p', locking 'text_mutex'. */ - ret = register_aggr_kprobe(old_p, p); - goto out; - } - - cpus_read_lock(); - /* Prevent text modification */ - mutex_lock(&text_mutex); - ret = prepare_kprobe(p); - mutex_unlock(&text_mutex); - cpus_read_unlock(); - if (ret) - goto out; - - INIT_HLIST_NODE(&p->hlist); - hlist_add_head_rcu(&p->hlist, - &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - - if (!kprobes_all_disarmed && !kprobe_disabled(p)) { - ret = arm_kprobe(p); - if (ret) { - hlist_del_rcu(&p->hlist); - synchronize_rcu(); - goto out; - } - } - - /* Try to optimize kprobe */ - try_to_optimize_kprobe(p); -out: - mutex_unlock(&kprobe_mutex); + ret = __register_kprobe(p); if (probed_mod) module_put(probed_mod); @@ -1761,29 +1730,31 @@ static int __unregister_kprobe_top(struct kprobe *p) if (IS_ERR(ap)) return PTR_ERR(ap); - if (ap == p) - /* - * This probe is an independent(and non-optimized) kprobe - * (not an aggrprobe). Remove from the hash list. - */ - goto disarmed; + WARN_ON(ap != p && !kprobe_aggrprobe(ap)); - /* Following process expects this probe is an aggrprobe */ - WARN_ON(!kprobe_aggrprobe(ap)); - - if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) + /* + * If the probe is an independent(and non-optimized) kprobe + * (not an aggrprobe), the last kprobe on the aggrprobe, or + * kprobe is already disarmed, just remove from the hash list. + */ + if (ap == p || + (list_is_singular(&ap->list) && kprobe_disarmed(ap))) { /* * !disarmed could be happen if the probe is under delayed * unoptimizing. 
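[Editor's aside] For context on the register path reshuffled above: from a module author's point of view the external API is unchanged; a probe is still declared, registered and torn down exactly as before, only the internals now split address canonicalization, safety checks and the actual insertion into separate helpers. A minimal, self-contained caller sketch follows; the probed symbol and the demo_* names are illustrative only, not taken from this patch.

	#include <linux/kprobes.h>
	#include <linux/module.h>

	static int demo_pre_handler(struct kprobe *p, struct pt_regs *regs)
	{
		pr_info("demo: hit %s\n", p->symbol_name);
		return 0;
	}

	static struct kprobe demo_kp = {
		.symbol_name	= "kernel_clone",	/* example target only */
		.pre_handler	= demo_pre_handler,
	};

	static int __init demo_kprobe_init(void)
	{
		return register_kprobe(&demo_kp);
	}

	static void __exit demo_kprobe_exit(void)
	{
		unregister_kprobe(&demo_kp);
	}

	module_init(demo_kprobe_init);
	module_exit(demo_kprobe_exit);
	MODULE_LICENSE("GPL");
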
*/ - goto disarmed; - else { - /* If disabling probe has special handlers, update aggrprobe */ - if (p->post_handler && !kprobe_gone(p)) { - list_for_each_entry(list_p, &ap->list, list) { - if ((list_p != p) && (list_p->post_handler)) - goto noclean; - } + hlist_del_rcu(&ap->hlist); + return 0; + } + + /* If disabling probe has special handlers, update aggrprobe */ + if (p->post_handler && !kprobe_gone(p)) { + list_for_each_entry(list_p, &ap->list, list) { + if ((list_p != p) && (list_p->post_handler)) + break; + } + /* No other probe has post_handler */ + if (list_entry_is_head(list_p, &ap->list, list)) { /* * For the kprobe-on-ftrace case, we keep the * post_handler setting to identify this aggrprobe @@ -1792,24 +1763,21 @@ static int __unregister_kprobe_top(struct kprobe *p) if (!kprobe_ftrace(ap)) ap->post_handler = NULL; } -noclean: + } + + /* + * Remove from the aggrprobe: this path will do nothing in + * __unregister_kprobe_bottom(). + */ + list_del_rcu(&p->list); + if (!kprobe_disabled(ap) && !kprobes_all_disarmed) /* - * Remove from the aggrprobe: this path will do nothing in - * __unregister_kprobe_bottom(). + * Try to optimize this probe again, because post + * handler may have been changed. */ - list_del_rcu(&p->list); - if (!kprobe_disabled(ap) && !kprobes_all_disarmed) - /* - * Try to optimize this probe again, because post - * handler may have been changed. - */ - optimize_kprobe(ap); - } + optimize_kprobe(ap); return 0; -disarmed: - hlist_del_rcu(&ap->hlist); - return 0; } static void __unregister_kprobe_bottom(struct kprobe *p) @@ -1858,12 +1826,11 @@ void unregister_kprobes(struct kprobe **kps, int num) if (num <= 0) return; - mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) - if (__unregister_kprobe_top(kps[i]) < 0) - kps[i]->addr = NULL; - mutex_unlock(&kprobe_mutex); - + scoped_guard(mutex, &kprobe_mutex) { + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(kps[i]) < 0) + kps[i]->addr = NULL; + } synchronize_rcu(); for (i = 0; i < num; i++) if (kps[i]->addr) @@ -2302,8 +2269,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num) if (num <= 0) return; - mutex_lock(&kprobe_mutex); for (i = 0; i < num; i++) { + guard(mutex)(&kprobe_mutex); + if (__unregister_kprobe_top(&rps[i]->kp) < 0) rps[i]->kp.addr = NULL; #ifdef CONFIG_KRETPROBE_ON_RETHOOK @@ -2312,7 +2280,6 @@ void unregister_kretprobes(struct kretprobe **rps, int num) rcu_assign_pointer(rps[i]->rph->rp, NULL); #endif } - mutex_unlock(&kprobe_mutex); synchronize_rcu(); for (i = 0; i < num; i++) { @@ -2393,18 +2360,14 @@ static void kill_kprobe(struct kprobe *p) /* Disable one kprobe */ int disable_kprobe(struct kprobe *kp) { - int ret = 0; struct kprobe *p; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* Disable this kprobe */ p = __disable_kprobe(kp); - if (IS_ERR(p)) - ret = PTR_ERR(p); - mutex_unlock(&kprobe_mutex); - return ret; + return IS_ERR(p) ? PTR_ERR(p) : 0; } EXPORT_SYMBOL_GPL(disable_kprobe); @@ -2414,20 +2377,16 @@ int enable_kprobe(struct kprobe *kp) int ret = 0; struct kprobe *p; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* Check whether specified probe is valid. */ p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { - ret = -EINVAL; - goto out; - } + if (unlikely(p == NULL)) + return -EINVAL; - if (kprobe_gone(kp)) { + if (kprobe_gone(kp)) /* This kprobe has gone, we couldn't enable it. 
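[Editor's aside] The disable_kprobe()/enable_kprobe() rewrites above are likewise behaviour-preserving for callers: both still return 0 on success or a negative errno. A tiny hypothetical helper (not from this patch) for pausing an already-registered probe:

	#include <linux/kprobes.h>

	/* Temporarily mute a registered probe without unregistering it. */
	static int demo_set_probe_paused(struct kprobe *kp, bool paused)
	{
		return paused ? disable_kprobe(kp) : enable_kprobe(kp);
	}
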
*/ - ret = -EINVAL; - goto out; - } + return -EINVAL; if (p != kp) kp->flags &= ~KPROBE_FLAG_DISABLED; @@ -2441,8 +2400,6 @@ int enable_kprobe(struct kprobe *kp) kp->flags |= KPROBE_FLAG_DISABLED; } } -out: - mutex_unlock(&kprobe_mutex); return ret; } EXPORT_SYMBOL_GPL(enable_kprobe); @@ -2630,11 +2587,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); - if (val == MODULE_STATE_COMING) { - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); + + if (val == MODULE_STATE_COMING) add_module_kprobe_blacklist(mod); - mutex_unlock(&kprobe_mutex); - } + if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2644,7 +2601,6 @@ static int kprobes_module_callback(struct notifier_block *nb, * notified, only '.init.text' section would be freed. We need to * disable kprobes which have been inserted in the sections. */ - mutex_lock(&kprobe_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry(p, head, hlist) @@ -2667,7 +2623,6 @@ static int kprobes_module_callback(struct notifier_block *nb, } if (val == MODULE_STATE_GOING) remove_module_kprobe_blacklist(mod); - mutex_unlock(&kprobe_mutex); return NOTIFY_DONE; } @@ -2695,7 +2650,7 @@ void kprobe_free_init_mem(void) struct kprobe *p; int i; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* Kill all kprobes on initmem because the target code has been freed. */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -2705,8 +2660,6 @@ void kprobe_free_init_mem(void) kill_kprobe(p); } } - - mutex_unlock(&kprobe_mutex); } static int __init init_kprobes(void) @@ -2902,11 +2855,11 @@ static int arm_all_kprobes(void) unsigned int i, total = 0, errors = 0; int err, ret = 0; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* If kprobes are armed, just return */ if (!kprobes_all_disarmed) - goto already_enabled; + return 0; /* * optimize_kprobe() called by arm_kprobe() checks @@ -2936,8 +2889,6 @@ static int arm_all_kprobes(void) else pr_info("Kprobes globally enabled\n"); -already_enabled: - mutex_unlock(&kprobe_mutex); return ret; } @@ -2948,13 +2899,11 @@ static int disarm_all_kprobes(void) unsigned int i, total = 0, errors = 0; int err, ret = 0; - mutex_lock(&kprobe_mutex); + guard(mutex)(&kprobe_mutex); /* If kprobes are already disarmed, just return */ - if (kprobes_all_disarmed) { - mutex_unlock(&kprobe_mutex); + if (kprobes_all_disarmed) return 0; - } kprobes_all_disarmed = true; @@ -2979,11 +2928,8 @@ static int disarm_all_kprobes(void) else pr_info("Kprobes globally disabled\n"); - mutex_unlock(&kprobe_mutex); - /* Wait for disarming all kprobes by optimizer */ - wait_for_kprobe_optimizer(); - + wait_for_kprobe_optimizer_locked(); return ret; } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1bab21b4718f..eefb67d9883c 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -239,21 +239,7 @@ extern const void __start_notes; extern const void __stop_notes; #define notes_size (&__stop_notes - &__start_notes) -static ssize_t notes_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count) -{ - memcpy(buf, &__start_notes + off, count); - return count; -} - -static struct bin_attribute notes_attr __ro_after_init = { - .attr = { - .name = "notes", - .mode = S_IRUGO, - }, - .read = ¬es_read, -}; +static __ro_after_init BIN_ATTR_SIMPLE_RO(notes); struct kobject *kernel_kobj; EXPORT_SYMBOL_GPL(kernel_kobj); @@ -307,8 +293,9 @@ static int 
__init ksysfs_init(void) goto kset_exit; if (notes_size > 0) { - notes_attr.size = notes_size; - error = sysfs_create_bin_file(kernel_kobj, ¬es_attr); + bin_attr_notes.private = (void *)&__start_notes; + bin_attr_notes.size = notes_size; + error = sysfs_create_bin_file(kernel_kobj, &bin_attr_notes); if (error) goto group_exit; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 1eb6f62a9165..5dc5b0d7238e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -35,6 +35,9 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; +static LIST_HEAD(kthreads_hotplug); +static DEFINE_MUTEX(kthreads_hotplug_lock); + struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ @@ -53,6 +56,8 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + unsigned int node; + int started; int result; int (*threadfn)(void *); void *data; @@ -63,6 +68,9 @@ struct kthread { #endif /* To store the full name if task comm is truncated. */ char *full_name; + struct task_struct *task; + struct list_head hotplug_node; + struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { @@ -121,8 +129,11 @@ bool set_kthread_struct(struct task_struct *p) init_completion(&kthread->exited); init_completion(&kthread->parked); + INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; + kthread->task = p; + kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } @@ -313,6 +324,16 @@ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; + if (!list_empty(&kthread->hotplug_node)) { + mutex_lock(&kthreads_hotplug_lock); + list_del(&kthread->hotplug_node); + mutex_unlock(&kthreads_hotplug_lock); + + if (kthread->preferred_affinity) { + kfree(kthread->preferred_affinity); + kthread->preferred_affinity = NULL; + } + } do_exit(0); } EXPORT_SYMBOL(kthread_exit); @@ -338,6 +359,56 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) } EXPORT_SYMBOL(kthread_complete_and_exit); +static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) +{ + const struct cpumask *pref; + + if (kthread->preferred_affinity) { + pref = kthread->preferred_affinity; + } else { + if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) + return; + pref = cpumask_of_node(kthread->node); + } + + cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); + if (cpumask_empty(cpumask)) + cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); +} + +static void kthread_affine_node(void) +{ + struct kthread *kthread = to_kthread(current); + cpumask_var_t affinity; + + WARN_ON_ONCE(kthread_is_per_cpu(current)); + + if (kthread->node == NUMA_NO_NODE) { + housekeeping_affine(current, HK_TYPE_KTHREAD); + } else { + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { + WARN_ON_ONCE(1); + return; + } + + mutex_lock(&kthreads_hotplug_lock); + WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); + list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); + /* + * The node cpumask is racy when read from kthread() but: + * - a racing CPU going down will either fail on the subsequent + * call to set_cpus_allowed_ptr() or be migrated to housekeepers + * afterwards by the scheduler. 
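[Editor's aside] Looking back at the kheaders and ksysfs hunks just above: both drop an open-coded read callback in favour of the BIN_ATTR_SIMPLE_RO() family, where the caller only fills in .private and .size and the common sysfs helper copies the blob out. A rough sketch of the same idiom for a made-up attribute (all demo_* names are hypothetical):

	#include <linux/cache.h>
	#include <linux/init.h>
	#include <linux/kobject.h>
	#include <linux/sysfs.h>

	/* Hypothetical read-only blob exposed as /sys/kernel/demo_blob. */
	static const char demo_blob[] = "hello from the kernel\n";

	/* Declares bin_attr_demo_blob using the simple sysfs read helper. */
	static __ro_after_init BIN_ATTR_SIMPLE_RO(demo_blob);

	static int __init demo_blob_init(void)
	{
		bin_attr_demo_blob.private = (void *)demo_blob;
		bin_attr_demo_blob.size = sizeof(demo_blob) - 1;

		return sysfs_create_bin_file(kernel_kobj, &bin_attr_demo_blob);
	}
	late_initcall(demo_blob_init);
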
+ * - a racing CPU going up will be handled by kthreads_online_cpu() + */ + kthread_fetch_affinity(kthread, affinity); + set_cpus_allowed_ptr(current, affinity); + mutex_unlock(&kthreads_hotplug_lock); + + free_cpumask_var(affinity); + } +} + static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; @@ -368,7 +439,6 @@ static int kthread(void *_create) * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); @@ -382,6 +452,11 @@ static int kthread(void *_create) schedule_preempt_disabled(); preempt_enable(); + self->started = 1; + + if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) + kthread_affine_node(); + ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); @@ -540,7 +615,9 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { + struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); + WARN_ON_ONCE(kthread->started); } /** @@ -554,7 +631,9 @@ void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) */ void kthread_bind(struct task_struct *p, unsigned int cpu) { + struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); + WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); @@ -775,6 +854,92 @@ int kthreadd(void *unused) return 0; } +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) +{ + struct kthread *kthread = to_kthread(p); + cpumask_var_t affinity; + unsigned long flags; + int ret = 0; + + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { + WARN_ON(1); + return -EINVAL; + } + + WARN_ON_ONCE(kthread->preferred_affinity); + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return -ENOMEM; + + kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); + if (!kthread->preferred_affinity) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&kthreads_hotplug_lock); + cpumask_copy(kthread->preferred_affinity, mask); + WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); + list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); + kthread_fetch_affinity(kthread, affinity); + + /* It's safe because the task is inactive. */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + do_set_cpus_allowed(p, affinity); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + mutex_unlock(&kthreads_hotplug_lock); +out: + free_cpumask_var(affinity); + + return ret; +} + +/* + * Re-affine kthreads according to their preferences + * and the newly online CPU. The CPU down part is handled + * by select_fallback_rq() which default re-affines to + * housekeepers from other nodes in case the preferred + * affinity doesn't apply anymore. 
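[Editor's aside] kthread_affine_preferred() above is the new entry point for callers that want a soft CPU preference rather than a hard kthread_bind(): per the code it must run before the thread has ever been scheduled (otherwise it WARNs and returns -EINVAL), and kthreads_online_cpu() re-applies the mask as CPUs come online. A hypothetical caller sketch under those assumptions (the demo_* names are not from this patch):

	#include <linux/cpumask.h>
	#include <linux/err.h>
	#include <linux/kthread.h>
	#include <linux/sched.h>

	static int demo_thread_fn(void *data)
	{
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}

	/* Spawn a kthread that prefers (but is not pinned to) @pref. */
	static struct task_struct *demo_spawn(const struct cpumask *pref)
	{
		struct task_struct *t;

		t = kthread_create(demo_thread_fn, NULL, "demo_kthread");
		if (IS_ERR(t))
			return t;

		/* Must be called before the first wake-up. */
		if (kthread_affine_preferred(t, pref))
			pr_warn("demo: keeping default kthread affinity\n");

		wake_up_process(t);
		return t;
	}
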
+ */ +static int kthreads_online_cpu(unsigned int cpu) +{ + cpumask_var_t affinity; + struct kthread *k; + int ret; + + guard(mutex)(&kthreads_hotplug_lock); + + if (list_empty(&kthreads_hotplug)) + return 0; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return -ENOMEM; + + ret = 0; + + list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { + if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || + kthread_is_per_cpu(k->task))) { + ret = -EINVAL; + continue; + } + kthread_fetch_affinity(k, affinity); + set_cpus_allowed_ptr(k->task, affinity); + } + + free_cpumask_var(affinity); + + return ret; +} + +static int kthreads_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", + kthreads_online_cpu, NULL); +} +early_initcall(kthreads_init); + void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) @@ -866,12 +1031,11 @@ repeat: EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * -__kthread_create_worker(int cpu, unsigned int flags, - const char namefmt[], va_list args) +__kthread_create_worker_on_node(unsigned int flags, int node, + const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; - int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -879,20 +1043,14 @@ __kthread_create_worker(int cpu, unsigned int flags, kthread_init_worker(worker); - if (cpu >= 0) - node = cpu_to_node(cpu); - task = __kthread_create_on_node(kthread_worker_fn, worker, - node, namefmt, args); + node, namefmt, args); if (IS_ERR(task)) goto fail_task; - if (cpu >= 0) - kthread_bind(task, cpu); - worker->flags = flags; worker->task = task; - wake_up_process(task); + return worker; fail_task: @@ -901,8 +1059,9 @@ fail_task: } /** - * kthread_create_worker - create a kthread worker + * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker + * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) @@ -910,25 +1069,26 @@ fail_task: * when the caller was killed by a fatal signal. */ struct kthread_worker * -kthread_create_worker(unsigned int flags, const char namefmt[], ...) +kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); - worker = __kthread_create_worker(-1, flags, namefmt, args); + worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } -EXPORT_SYMBOL(kthread_create_worker); +EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker - * @namefmt: printf-style name for the kthread worker (task). + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. @@ -960,14 +1120,13 @@ EXPORT_SYMBOL(kthread_create_worker); */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...) 
+ const char namefmt[]) { struct kthread_worker *worker; - va_list args; - va_start(args, namefmt); - worker = __kthread_create_worker(cpu, flags, namefmt, args); - va_end(args); + worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); + if (!IS_ERR(worker)) + kthread_bind(worker->task, cpu); return worker; } @@ -1016,7 +1175,7 @@ static void kthread_insert_work(struct kthread_worker *worker, * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task - * must have been created with kthread_worker_create(). Returns %true + * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 7a75eab9c179..d4281d1e13a6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -77,7 +77,7 @@ static int sysctl_latencytop(const struct ctl_table *table, int write, void *buf return err; } -static struct ctl_table latencytop_sysctl[] = { +static const struct ctl_table latencytop_sysctl[] = { { .procname = "latencytop", .data = &latencytop_enabled, @@ -158,9 +158,9 @@ account_global_scheduler_latency(struct task_struct *tsk, /** * __account_scheduler_latency - record an occurred latency - * @tsk - the task struct of the task hitting the latency - * @usecs - the duration of the latency in microseconds - * @inter - 1 if the sleep was interruptible, 0 if uninterruptible + * @tsk: the task struct of the task hitting the latency + * @usecs: the duration of the latency in microseconds + * @inter: 1 if the sleep was interruptible, 0 if uninterruptible * * This function is the main entry point for recording latency entries * as called by the scheduler. 
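[Editor's aside] With the kthread.c hunk above, kthread_create_worker_on_cpu() no longer takes printf-style varargs: the format is expected to end in %u and the helper substitutes the CPU number itself before binding the worker with kthread_bind(). A caller-side sketch under that assumption; the worker and work names are made up, and the rest of the series (not shown in this excerpt) determines when the worker task is first woken.

	#include <linux/err.h>
	#include <linux/kthread.h>

	static void demo_work_fn(struct kthread_work *work)
	{
		pr_info("demo: work executed\n");
	}

	static DEFINE_KTHREAD_WORK(demo_work, demo_work_fn);

	/* Hypothetical: start a per-CPU worker and feed it one work item. */
	static struct kthread_worker *demo_worker_start(int cpu)
	{
		struct kthread_worker *w;

		/* The trailing %u is filled in with @cpu by the helper. */
		w = kthread_create_worker_on_cpu(cpu, 0, "demo_worker/%u");
		if (IS_ERR(w))
			return w;

		kthread_queue_work(w, &demo_work);
		return w;
	}
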
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 3c21c31796db..0cd39954d5a1 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -347,6 +347,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, * /sys/kernel/livepatch/<patch>/transition * /sys/kernel/livepatch/<patch>/force * /sys/kernel/livepatch/<patch>/replace + * /sys/kernel/livepatch/<patch>/stack_order * /sys/kernel/livepatch/<patch>/<object> * /sys/kernel/livepatch/<patch>/<object>/patched * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> @@ -452,15 +453,38 @@ static ssize_t replace_show(struct kobject *kobj, return sysfs_emit(buf, "%d\n", patch->replace); } +static ssize_t stack_order_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch, *this_patch; + int stack_order = 0; + + this_patch = container_of(kobj, struct klp_patch, kobj); + + mutex_lock(&klp_mutex); + + klp_for_each_patch(patch) { + stack_order++; + if (patch == this_patch) + break; + } + + mutex_unlock(&klp_mutex); + + return sysfs_emit(buf, "%d\n", stack_order); +} + static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); static struct kobj_attribute replace_kobj_attr = __ATTR_RO(replace); +static struct kobj_attribute stack_order_kobj_attr = __ATTR_RO(stack_order); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, &force_kobj_attr.attr, &replace_kobj_attr.attr, + &stack_order_kobj_attr.attr, NULL }; ATTRIBUTE_GROUPS(klp_patch); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0db4093d17b8..a114949eeed5 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,7 +5,8 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. 
+KASAN_SANITIZE_lockdep.o := n KCSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 97fb6f3f840a..9ef9850aeebe 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -67,3 +67,31 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ + +/* + * Locking events for rtlock_slowlock() + */ +LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */ +LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */ +LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */ + +/* + * Locking events for rt_mutex_slowlock() + */ +LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */ +LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */ +LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */ +LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */ +LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */ +LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */ +LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */ +LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */ + +/* + * Locking events for lockdep + */ +LOCK_EVENT(lockdep_acquire) +LOCK_EVENT(lockdep_lock) +LOCK_EVENT(lockdep_nocheck) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2d8ec0351ef9..b15757e63626 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -57,10 +57,12 @@ #include <linux/lockdep.h> #include <linux/context_tracking.h> #include <linux/console.h> +#include <linux/kasan.h> #include <asm/sections.h> #include "lockdep_internals.h" +#include "lock_events.h" #include <trace/events/lock.h> @@ -79,7 +81,7 @@ module_param(lock_stat, int, 0644); #endif #ifdef CONFIG_SYSCTL -static struct ctl_table kern_lockdep_table[] = { +static const struct ctl_table kern_lockdep_table[] = { #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -157,10 +159,12 @@ static inline void lockdep_unlock(void) __this_cpu_dec(lockdep_recursion); } +#ifdef CONFIG_PROVE_LOCKING static inline bool lockdep_assert_locked(void) { return DEBUG_LOCKS_WARN_ON(__owner != current); } +#endif static struct task_struct *lockdep_selftest_task_struct; @@ -168,6 +172,7 @@ static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { lockdep_lock(); + lockevent_inc(lockdep_lock); /* * Make sure that if another CPU detected a bug while * walking the graph we dont change it (while the other @@ -430,7 +435,7 @@ static inline u16 hlock_id(struct held_lock *hlock) return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS)); } -static inline unsigned int chain_hlock_class_idx(u16 hlock_id) +static inline __maybe_unused unsigned int chain_hlock_class_idx(u16 hlock_id) { return hlock_id & (MAX_LOCKDEP_KEYS - 1); } @@ -5089,8 +5094,12 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(lock->key == &__lockdep_no_track__)) return 0; - if (!prove_locking || lock->key == &__lockdep_no_validate__) + lockevent_inc(lockdep_acquire); + + if (!prove_locking || lock->key == &__lockdep_no_validate__) { check = 0; + lockevent_inc(lockdep_nocheck); + } if 
(subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -5822,6 +5831,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!debug_locks) return; + /* + * As KASAN instrumentation is disabled and lock_acquire() is usually + * the first lockdep call when a task tries to acquire a lock, add + * kasan_check_byte() here to check for use-after-free and other + * memory errors. + */ + kasan_check_byte(lock); + if (unlikely(!lockdep_enabled())) { /* XXX allow trylock from NMI ?!? */ if (lockdep_nmi() && !trylock) { diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index bbe9000260d0..20f9ef58d3d0 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -119,7 +119,8 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) +#define AVG_LOCKDEP_CHAIN_DEPTH 5 +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS * AVG_LOCKDEP_CHAIN_DEPTH) extern struct lock_chain lock_chains[]; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index de95ec07e477..cc33470f4de9 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -106,7 +106,7 @@ static const struct kernel_param_ops lt_bind_ops = { module_param_cb(bind_readers, <_bind_ops, &bind_readers, 0644); module_param_cb(bind_writers, <_bind_ops, &bind_writers, 0644); -long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); static struct task_struct *stats_task; static struct task_struct **writer_tasks; @@ -1358,7 +1358,7 @@ static int __init lock_torture_init(void) if (torture_init_error(firsterr)) goto unwind; if (cpumask_nonempty(bind_writers)) - torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers); + torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers, true); create_reader: if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) @@ -1369,7 +1369,7 @@ static int __init lock_torture_init(void) if (torture_init_error(firsterr)) goto unwind; if (cpumask_nonempty(bind_readers)) - torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers); + torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers, true); } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 3302e52f0c96..19b636f60a24 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -143,6 +143,8 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; + MUTEX_WARN_ON(lock->magic != lock); + if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr)) return true; @@ -657,10 +659,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err; } - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - /* Make sure we do wakeups before calling schedule */ - wake_up_q(&wake_q); - wake_q_init(&wake_q); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); schedule_preempt_disabled(); @@ -710,8 +709,7 @@ skip_wait: if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); preempt_enable(); return 0; @@ -720,10 +718,9 
@@ err: __mutex_remove_waiter(lock, &waiter); err_early_kill: trace_contention_end(lock, ret); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); - wake_up_q(&wake_q); preempt_enable(); return ret; } @@ -935,10 +932,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); } #ifndef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 697a56d3d949..c80902eacd79 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -27,6 +27,7 @@ #include <trace/events/lock.h> #include "rtmutex_common.h" +#include "lock_events.h" #ifndef WW_RT # define build_ww_mutex() (false) @@ -1292,13 +1293,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, */ get_task_struct(owner); - preempt_disable(); - raw_spin_unlock_irq(&lock->wait_lock); - /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */ - wake_up_q(wake_q); - wake_q_init(wake_q); - preempt_enable(); - + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); @@ -1618,10 +1613,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) + if (try_to_take_rt_mutex(lock, current, waiter)) { + lockevent_inc(rtmutex_slow_acq3); break; + } if (timeout && !timeout->task) { ret = -ETIMEDOUT; @@ -1642,16 +1640,12 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = rt_mutex_owner(lock); else owner = NULL; - preempt_disable(); - raw_spin_unlock_irq(&lock->wait_lock); - if (wake_q) { - wake_up_q(wake_q); - wake_q_init(wake_q); - } - preempt_enable(); + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) { + lockevent_inc(rtmutex_slow_sleep); rt_mutex_schedule(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1706,6 +1700,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, int ret; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtmutex_slowlock); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -1713,6 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq1); return 0; } @@ -1731,10 +1727,12 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); ww_mutex_lock_acquired(ww, ww_ctx); } + lockevent_inc(rtmutex_slow_acq2); } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); + lockevent_inc(rtmutex_deadlock); } /* @@ -1763,6 +1761,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, &waiter, wake_q); debug_rt_mutex_free_waiter(&waiter); + 
lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q)); return ret; } @@ -1799,10 +1798,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, */ raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q); - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); rt_mutex_post_schedule(); return ret; @@ -1838,9 +1834,12 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, struct task_struct *owner; lockdep_assert_held(&lock->wait_lock); + lockevent_inc(rtlock_slowlock); - if (try_to_take_rt_mutex(lock, current, NULL)) + if (try_to_take_rt_mutex(lock, current, NULL)) { + lockevent_inc(rtlock_slow_acq1); return; + } rt_mutex_init_rtlock_waiter(&waiter); @@ -1853,21 +1852,21 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, for (;;) { /* Try to acquire the lock again */ - if (try_to_take_rt_mutex(lock, current, &waiter)) + if (try_to_take_rt_mutex(lock, current, &waiter)) { + lockevent_inc(rtlock_slow_acq2); break; + } if (&waiter == rt_mutex_top_waiter(lock)) owner = rt_mutex_owner(lock); else owner = NULL; - preempt_disable(); - raw_spin_unlock_irq(&lock->wait_lock); - wake_up_q(wake_q); - wake_q_init(wake_q); - preempt_enable(); + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); - if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) + if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) { + lockevent_inc(rtlock_slow_sleep); schedule_rtlock(); + } raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_RTLOCK_WAIT); @@ -1884,6 +1883,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, debug_rt_mutex_free_waiter(&waiter); trace_contention_end(lock, 0); + lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q)); } static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) @@ -1893,10 +1893,7 @@ static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) raw_spin_lock_irqsave(&lock->wait_lock, flags); rtlock_slowlock_locked(lock, &wake_q); - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); } #endif /* RT_MUTEX_BUILD_SPINLOCKS */ diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index c38a2d2d4a7e..78dd3d8c6554 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -59,8 +59,8 @@ struct rt_mutex_waiter { }; /** - * rt_wake_q_head - Wrapper around regular wake_q_head to support - * "sleeping" spinlocks on RT + * struct rt_wake_q_head - Wrapper around regular wake_q_head to support + * "sleeping" spinlocks on RT * @head: The regular wake_q_head for sleeping lock variants * @rtlock_task: Task pointer for RT lock (spin/rwlock) wakeups */ diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 34bfae72f295..de9117c0e671 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -29,6 +29,7 @@ #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/debug.h> +#include <linux/sched/wake_q.h> #include <linux/semaphore.h> #include <linux/spinlock.h> #include <linux/ftrace.h> @@ -38,7 +39,7 @@ static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int 
__down_killable(struct semaphore *sem); static noinline int __down_timeout(struct semaphore *sem, long timeout); -static noinline void __up(struct semaphore *sem); +static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q); /** * down - acquire the semaphore @@ -183,13 +184,16 @@ EXPORT_SYMBOL(down_timeout); void __sched up(struct semaphore *sem) { unsigned long flags; + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else - __up(sem); + __up(sem, &wake_q); raw_spin_unlock_irqrestore(&sem->lock, flags); + if (!wake_q_empty(&wake_q)) + wake_up_q(&wake_q); } EXPORT_SYMBOL(up); @@ -269,11 +273,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); } -static noinline void __sched __up(struct semaphore *sem) +static noinline void __sched __up(struct semaphore *sem, + struct wake_q_head *wake_q) { struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, struct semaphore_waiter, list); list_del(&waiter->list); waiter->up = true; - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); } diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 5d58b2c0ef98..bcb1b9fea588 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -404,7 +404,7 @@ static inline u32 prandom_u32_below(u32 ceil) static int *get_random_order(int count) { int *order; - int n, r, tmp; + int n, r; order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); if (!order) @@ -415,11 +415,8 @@ static int *get_random_order(int count) for (n = count - 1; n > 1; n--) { r = prandom_u32_below(n + 1); - if (r != n) { - tmp = order[n]; - order[n] = order[r]; - order[r] = tmp; - } + if (r != n) + swap(order[n], order[r]); } return order; diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 7b329057997a..d7762ef5949a 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -169,6 +169,36 @@ config MODVERSIONS make them incompatible with the kernel you are running. If unsure, say N. +choice + prompt "Module versioning implementation" + depends on MODVERSIONS + help + Select the tool used to calculate symbol versions for modules. + + If unsure, select GENKSYMS. + +config GENKSYMS + bool "genksyms (from source code)" + help + Calculate symbol versions from pre-processed source code using + genksyms. + + If unsure, say Y. + +config GENDWARFKSYMS + bool "gendwarfksyms (from debugging information)" + depends on DEBUG_INFO + # Requires full debugging information, split DWARF not supported. + depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT + # Requires ELF object files. + depends on !LTO + help + Calculate symbol versions from DWARF debugging information using + gendwarfksyms. Requires DEBUG_INFO to be enabled. + + If unsure, say N. +endchoice + config ASM_MODVERSIONS bool default HAVE_ASM_MODVERSIONS && MODVERSIONS @@ -177,6 +207,31 @@ config ASM_MODVERSIONS assembly. This can be enabled only when the target architecture supports it. +config EXTENDED_MODVERSIONS + bool "Extended Module Versioning Support" + depends on MODVERSIONS + help + This enables extended MODVERSIONs support, allowing long symbol + names to be versioned. + + The most likely reason you would enable this is to enable Rust + support. If unsure, say N. 
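The EXTENDED_MODVERSIONS option above exists because the classic __versions table stores each CRC next to a fixed-width name field, so symbols with long (for example Rust-mangled) names cannot be versioned at all. Below is a minimal sketch of that limitation; it assumes the existing struct modversion_info layout from include/linux/module.h, and the helper name is invented for illustration rather than being part of this series.

#include <linux/module.h>	/* struct modversion_info */
#include <linux/stddef.h>	/* sizeof_field() */
#include <linux/string.h>

/* Illustrative: does this symbol name fit a classic __versions record? */
static bool demo_fits_basic_record(const char *symname)
{
	/* The fixed name field must hold the string and its trailing NUL. */
	return strlen(symname) + 1 <= sizeof_field(struct modversion_info, name);
}

When that check would fail, only the extended format added later in this series (the parallel __version_ext_crcs/__version_ext_names sections) can carry the symbol's version, which is why Rust support is named as the main reason to say Y here.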
+ +config BASIC_MODVERSIONS + bool "Basic Module Versioning Support" + depends on MODVERSIONS + default y + help + This enables basic MODVERSIONS support, allowing older tools or + kernels to potentially load modules. + + Disabling this may cause older `modprobe` or `kmod` to be unable + to read MODVERSIONS information from built modules. With this + disabled, older kernels may treat this module as unversioned. + + This is enabled by default when MODVERSIONS are enabled. + If unsure, say Y. + config MODULE_SRCVERSION_ALL bool "Source checksum for all modules" help @@ -231,6 +286,7 @@ comment "Do not forget to sign required modules with scripts/sign-file" choice prompt "Hash algorithm to sign modules" depends on MODULE_SIG || IMA_APPRAISE_MODSIG + default MODULE_SIG_SHA512 help This determines which sort of hashing algorithm will be used during signature generation. This algorithm _must_ be built into the kernel diff --git a/kernel/module/internal.h b/kernel/module/internal.h index daef2be83902..d09b46ef032f 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -47,16 +47,16 @@ struct kernel_symbol { extern struct mutex module_mutex; extern struct list_head modules; -extern struct module_attribute *modinfo_attrs[]; -extern size_t modinfo_attrs_count; +extern const struct module_attribute *const modinfo_attrs[]; +extern const size_t modinfo_attrs_count; /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; -extern const s32 __start___kcrctab[]; -extern const s32 __start___kcrctab_gpl[]; +extern const u32 __start___kcrctab[]; +extern const u32 __start___kcrctab_gpl[]; struct load_info { const char *name; @@ -86,6 +86,8 @@ struct load_info { unsigned int vers; unsigned int info; unsigned int pcpu; + unsigned int vers_ext_crc; + unsigned int vers_ext_name; } index; }; @@ -102,7 +104,7 @@ struct find_symbol_arg { /* Output */ struct module *owner; - const s32 *crc; + const u32 *crc; const struct kernel_symbol *sym; enum mod_license license; }; @@ -327,7 +329,8 @@ static inline struct module *mod_find(unsigned long addr, struct mod_tree_root * } #endif /* CONFIG_MODULES_TREE_LOOKUP */ -int module_enable_rodata_ro(const struct module *mod, bool after_init); +int module_enable_rodata_ro(const struct module *mod); +int module_enable_rodata_ro_after_init(const struct module *mod); int module_enable_data_nx(const struct module *mod); int module_enable_text_rox(const struct module *mod); int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -384,16 +387,25 @@ static inline void init_param_lock(struct module *mod) { } #ifdef CONFIG_MODVERSIONS int check_version(const struct load_info *info, - const char *symname, struct module *mod, const s32 *crc); + const char *symname, struct module *mod, const u32 *crc); void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, struct kernel_symbol *ks, struct tracepoint * const *tp); int check_modstruct_version(const struct load_info *info, struct module *mod); int same_magic(const char *amagic, const char *bmagic, bool has_crcs); +struct modversion_info_ext { + size_t remaining; + const u32 *crc; + const char *name; +}; +void modversion_ext_start(const struct load_info *info, struct modversion_info_ext *ver); +void modversion_ext_advance(struct modversion_info_ext *ver); +#define 
for_each_modversion_info_ext(ver, info) \ + for (modversion_ext_start(info, &ver); ver.remaining > 0; modversion_ext_advance(&ver)) #else /* !CONFIG_MODVERSIONS */ static inline int check_version(const struct load_info *info, const char *symname, struct module *mod, - const s32 *crc) + const u32 *crc) { return 1; } diff --git a/kernel/module/main.c b/kernel/module/main.c index 5399c182b3cb..a256cc919ad7 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -86,7 +86,7 @@ struct mod_tree_root mod_tree __cacheline_aligned = { struct symsearch { const struct kernel_symbol *start, *stop; - const s32 *crcs; + const u32 *crcs; enum mod_license license; }; @@ -538,7 +538,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \ { \ mod->field = kstrdup(s, GFP_KERNEL); \ } \ -static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ +static ssize_t show_modinfo_##field(const struct module_attribute *mattr, \ struct module_kobject *mk, char *buffer) \ { \ return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \ @@ -552,7 +552,7 @@ static void free_modinfo_##field(struct module *mod) \ kfree(mod->field); \ mod->field = NULL; \ } \ -static struct module_attribute modinfo_##field = { \ +static const struct module_attribute modinfo_##field = { \ .attr = { .name = __stringify(field), .mode = 0444 }, \ .show = show_modinfo_##field, \ .setup = setup_modinfo_##field, \ @@ -842,13 +842,13 @@ void symbol_put_addr(void *addr) } EXPORT_SYMBOL_GPL(symbol_put_addr); -static ssize_t show_refcnt(struct module_attribute *mattr, +static ssize_t show_refcnt(const struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { return sprintf(buffer, "%i\n", module_refcount(mk->mod)); } -static struct module_attribute modinfo_refcnt = +static const struct module_attribute modinfo_refcnt = __ATTR(refcnt, 0444, show_refcnt, NULL); void __module_get(struct module *module) @@ -917,7 +917,7 @@ size_t module_flags_taint(unsigned long taints, char *buf) return l; } -static ssize_t show_initstate(struct module_attribute *mattr, +static ssize_t show_initstate(const struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { const char *state = "unknown"; @@ -938,10 +938,10 @@ static ssize_t show_initstate(struct module_attribute *mattr, return sprintf(buffer, "%s\n", state); } -static struct module_attribute modinfo_initstate = +static const struct module_attribute modinfo_initstate = __ATTR(initstate, 0444, show_initstate, NULL); -static ssize_t store_uevent(struct module_attribute *mattr, +static ssize_t store_uevent(const struct module_attribute *mattr, struct module_kobject *mk, const char *buffer, size_t count) { @@ -951,10 +951,10 @@ static ssize_t store_uevent(struct module_attribute *mattr, return rc ? 
rc : count; } -struct module_attribute module_uevent = +const struct module_attribute module_uevent = __ATTR(uevent, 0200, NULL, store_uevent); -static ssize_t show_coresize(struct module_attribute *mattr, +static ssize_t show_coresize(const struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { unsigned int size = mk->mod->mem[MOD_TEXT].size; @@ -966,11 +966,11 @@ static ssize_t show_coresize(struct module_attribute *mattr, return sprintf(buffer, "%u\n", size); } -static struct module_attribute modinfo_coresize = +static const struct module_attribute modinfo_coresize = __ATTR(coresize, 0444, show_coresize, NULL); #ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC -static ssize_t show_datasize(struct module_attribute *mattr, +static ssize_t show_datasize(const struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { unsigned int size = 0; @@ -980,11 +980,11 @@ static ssize_t show_datasize(struct module_attribute *mattr, return sprintf(buffer, "%u\n", size); } -static struct module_attribute modinfo_datasize = +static const struct module_attribute modinfo_datasize = __ATTR(datasize, 0444, show_datasize, NULL); #endif -static ssize_t show_initsize(struct module_attribute *mattr, +static ssize_t show_initsize(const struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { unsigned int size = 0; @@ -994,10 +994,10 @@ static ssize_t show_initsize(struct module_attribute *mattr, return sprintf(buffer, "%u\n", size); } -static struct module_attribute modinfo_initsize = +static const struct module_attribute modinfo_initsize = __ATTR(initsize, 0444, show_initsize, NULL); -static ssize_t show_taint(struct module_attribute *mattr, +static ssize_t show_taint(const struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { size_t l; @@ -1007,10 +1007,10 @@ static ssize_t show_taint(struct module_attribute *mattr, return l; } -static struct module_attribute modinfo_taint = +static const struct module_attribute modinfo_taint = __ATTR(taint, 0444, show_taint, NULL); -struct module_attribute *modinfo_attrs[] = { +const struct module_attribute *const modinfo_attrs[] = { &module_uevent, &modinfo_version, &modinfo_srcversion, @@ -1027,7 +1027,7 @@ struct module_attribute *modinfo_attrs[] = { NULL, }; -size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs); +const size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs); static const char vermagic[] = VERMAGIC_STRING; @@ -1221,18 +1221,6 @@ void __weak module_arch_freeing_init(struct module *mod) { } -void *__module_writable_address(struct module *mod, void *loc) -{ - for_class_mod_mem_type(type, text) { - struct module_memory *mem = &mod->mem[type]; - - if (loc >= mem->base && loc < mem->base + mem->size) - return loc + (mem->rw_copy - mem->base); - } - - return loc; -} - static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { unsigned int size = PAGE_ALIGN(mod->mem[type].size); @@ -1250,21 +1238,15 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) if (!ptr) return -ENOMEM; - mod->mem[type].base = ptr; - if (execmem_is_rox(execmem_type)) { - ptr = vzalloc(size); + int err = execmem_make_temp_rw(ptr, size); - if (!ptr) { - execmem_free(mod->mem[type].base); + if (err) { + execmem_free(ptr); return -ENOMEM; } - mod->mem[type].rw_copy = ptr; mod->mem[type].is_rox = true; - } else { - mod->mem[type].rw_copy = mod->mem[type].base; - memset(mod->mem[type].base, 0, size); } /* @@ -1278,18 +1260,29 @@ static int module_memory_alloc(struct module *mod, 
enum mod_mem_type type) * *do* eventually get freed, but let's just keep things simple * and avoid *any* false positives. */ - kmemleak_not_leak(ptr); + if (!mod->mem[type].is_rox) + kmemleak_not_leak(ptr); + + memset(ptr, 0, size); + mod->mem[type].base = ptr; return 0; } +static void module_memory_restore_rox(struct module *mod) +{ + for_class_mod_mem_type(type, text) { + struct module_memory *mem = &mod->mem[type]; + + if (mem->is_rox) + execmem_restore_rox(mem->base, mem->size); + } +} + static void module_memory_free(struct module *mod, enum mod_mem_type type) { struct module_memory *mem = &mod->mem[type]; - if (mem->is_rox) - vfree(mem->rw_copy); - execmem_free(mem->base); } @@ -1681,7 +1674,7 @@ static void module_license_taint_check(struct module *mod, const char *license) static void setup_modinfo(struct module *mod, struct load_info *info) { - struct module_attribute *attr; + const struct module_attribute *attr; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { @@ -1692,7 +1685,7 @@ static void setup_modinfo(struct module *mod, struct load_info *info) static void free_modinfo(struct module *mod) { - struct module_attribute *attr; + const struct module_attribute *attr; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { @@ -2074,6 +2067,82 @@ static int elf_validity_cache_index_str(struct load_info *info) } /** + * elf_validity_cache_index_versions() - Validate and cache version indices + * @info: Load info to cache version indices in. + * Must have &load_info->sechdrs and &load_info->secstrings populated. + * @flags: Load flags, relevant to suppress version loading, see + * uapi/linux/module.h + * + * If we're ignoring modversions based on @flags, zero all version indices + * and return validity. Otherwise check: + * + * * If "__version_ext_crcs" is present, "__version_ext_names" is present + * * There is a name present for every crc + * + * Then populate: + * + * * &load_info->index.vers + * * &load_info->index.vers_ext_crc + * * &load_info->index.vers_ext_name + * + * if present. + * + * Return: %0 if valid, %-ENOEXEC on failure. + */ +static int elf_validity_cache_index_versions(struct load_info *info, int flags) +{ + unsigned int vers_ext_crc; + unsigned int vers_ext_name; + size_t crc_count; + size_t remaining_len; + size_t name_size; + char *name; + + /* If modversions were suppressed, pretend we didn't find any */ + if (flags & MODULE_INIT_IGNORE_MODVERSIONS) { + info->index.vers = 0; + info->index.vers_ext_crc = 0; + info->index.vers_ext_name = 0; + return 0; + } + + vers_ext_crc = find_sec(info, "__version_ext_crcs"); + vers_ext_name = find_sec(info, "__version_ext_names"); + + /* If we have one field, we must have the other */ + if (!!vers_ext_crc != !!vers_ext_name) { + pr_err("extended version crc+name presence does not match"); + return -ENOEXEC; + } + + /* + * If we have extended version information, we should have the same + * number of entries in every section.
+ */ + if (vers_ext_crc) { + crc_count = info->sechdrs[vers_ext_crc].sh_size / sizeof(u32); + name = (void *)info->hdr + + info->sechdrs[vers_ext_name].sh_offset; + remaining_len = info->sechdrs[vers_ext_name].sh_size; + + while (crc_count--) { + name_size = strnlen(name, remaining_len) + 1; + if (name_size > remaining_len) { + pr_err("more extended version crcs than names"); + return -ENOEXEC; + } + remaining_len -= name_size; + name += name_size; + } + } + + info->index.vers = find_sec(info, "__versions"); + info->index.vers_ext_crc = vers_ext_crc; + info->index.vers_ext_name = vers_ext_name; + return 0; +} + +/** * elf_validity_cache_index() - Resolve, validate, cache section indices * @info: Load info to read from and update. * &load_info->sechdrs and &load_info->secstrings must be populated. @@ -2087,9 +2156,7 @@ static int elf_validity_cache_index_str(struct load_info *info) * * elf_validity_cache_index_mod() * * elf_validity_cache_index_sym() * * elf_validity_cache_index_str() - * - * If versioning is not suppressed via flags, load the version index from - * a section called "__versions" with no validation. + * * elf_validity_cache_index_versions() * * If CONFIG_SMP is enabled, load the percpu section by name with no * validation. @@ -2112,11 +2179,9 @@ static int elf_validity_cache_index(struct load_info *info, int flags) err = elf_validity_cache_index_str(info); if (err < 0) return err; - - if (flags & MODULE_INIT_IGNORE_MODVERSIONS) - info->index.vers = 0; /* Pretend no __versions section! */ - else - info->index.vers = find_sec(info, "__versions"); + err = elf_validity_cache_index_versions(info, flags); + if (err < 0) + return err; info->index.pcpu = find_pcpusec(info); @@ -2327,16 +2392,29 @@ static int rewrite_section_headers(struct load_info *info, int flags) /* Track but don't keep modinfo and version sections. 
*/ info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->sechdrs[info->index.vers_ext_crc].sh_flags &= + ~(unsigned long)SHF_ALLOC; + info->sechdrs[info->index.vers_ext_name].sh_flags &= + ~(unsigned long)SHF_ALLOC; info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; return 0; } +static const char *const module_license_offenders[] = { + /* driverloader was caught wrongly pretending to be under GPL */ + "driverloader", + + /* lve claims to be GPL but upstream won't provide source */ + "lve", +}; + /* * These calls taint the kernel depending certain module circumstances */ static void module_augment_kernel_taints(struct module *mod, struct load_info *info) { int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); + size_t i; if (!get_modinfo(info, "intree")) { if (!test_taint(TAINT_OOT_MODULE)) @@ -2385,15 +2463,11 @@ static void module_augment_kernel_taints(struct module *mod, struct load_info *i if (strcmp(mod->name, "ndiswrapper") == 0) add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); - /* driverloader was caught wrongly pretending to be under GPL */ - if (strcmp(mod->name, "driverloader") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE, - LOCKDEP_NOW_UNRELIABLE); - - /* lve claims to be GPL but upstream won't provide source */ - if (strcmp(mod->name, "lve") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE, - LOCKDEP_NOW_UNRELIABLE); + for (i = 0; i < ARRAY_SIZE(module_license_offenders); ++i) { + if (strcmp(mod->name, module_license_offenders[i]) == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE, + LOCKDEP_NOW_UNRELIABLE); + } if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) pr_warn("%s: module license taints kernel.\n", mod->name); @@ -2561,7 +2635,6 @@ static int move_module(struct module *mod, struct load_info *info) for_each_mod_mem_type(type) { if (!mod->mem[type].size) { mod->mem[type].base = NULL; - mod->mem[type].rw_copy = NULL; continue; } @@ -2578,7 +2651,6 @@ static int move_module(struct module *mod, struct load_info *info) void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; const char *sname; - unsigned long addr; if (!(shdr->sh_flags & SHF_ALLOC)) continue; @@ -2599,14 +2671,12 @@ static int move_module(struct module *mod, struct load_info *info) ret = PTR_ERR(dest); goto out_err; } - addr = (unsigned long)dest; codetag_section_found = true; } else { enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; - addr = (unsigned long)mod->mem[type].base + offset; - dest = mod->mem[type].rw_copy + offset; + dest = mod->mem[type].base + offset; } if (shdr->sh_type != SHT_NOBITS) { @@ -2629,13 +2699,14 @@ static int move_module(struct module *mod, struct load_info *info) * users of info can keep taking advantage and using the newly * minted official memory area. */ - shdr->sh_addr = addr; + shdr->sh_addr = (unsigned long)dest; pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr, (long)shdr->sh_size, info->secstrings + shdr->sh_name); } return 0; out_err: + module_memory_restore_rox(mod); for (t--; t >= 0; t--) module_memory_free(mod, t); if (codetag_section_found) @@ -2782,17 +2853,8 @@ int __weak module_finalize(const Elf_Ehdr *hdr, return 0; } -int __weak module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - return 0; -} - static int post_relocation(struct module *mod, const struct load_info *info) { - int ret; - /* Sort exception table now relocations are done. 
*/ sort_extable(mod->extable, mod->extable + mod->num_exentries); @@ -2804,24 +2866,7 @@ static int post_relocation(struct module *mod, const struct load_info *info) add_kallsyms(mod, info); /* Arch-specific module finalizing. */ - ret = module_finalize(info->hdr, info->sechdrs, mod); - if (ret) - return ret; - - for_each_mod_mem_type(type) { - struct module_memory *mem = &mod->mem[type]; - - if (mem->is_rox) { - if (!execmem_update_copy(mem->base, mem->rw_copy, - mem->size)) - return -ENOMEM; - - vfree(mem->rw_copy); - mem->rw_copy = NULL; - } - } - - return module_post_finalize(info->hdr, info->sechdrs, mod); + return module_finalize(info->hdr, info->sechdrs, mod); } /* Call module constructors. */ @@ -2948,9 +2993,12 @@ static noinline int do_init_module(struct module *mod) /* Switch to core kallsyms now init is done: kallsyms may be walking! */ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif - ret = module_enable_rodata_ro(mod, true); + ret = module_enable_rodata_ro_after_init(mod); if (ret) - goto fail_mutex_unlock; + pr_warn("%s: module_enable_rodata_ro_after_init() returned %d, " + "ro_after_init data might still be writable\n", + mod->name, ret); + mod_tree_remove_init(mod); module_arch_freeing_init(mod); for_class_mod_mem_type(type, init) { @@ -2989,8 +3037,6 @@ static noinline int do_init_module(struct module *mod) return 0; -fail_mutex_unlock: - mutex_unlock(&module_mutex); fail_free_freeinit: kfree(freeinit); fail: @@ -3118,7 +3164,7 @@ static int complete_formation(struct module *mod, struct load_info *info) module_bug_finalize(info->hdr, info->sechdrs, mod); module_cfi_finalize(info->hdr, info->sechdrs, mod); - err = module_enable_rodata_ro(mod, false); + err = module_enable_rodata_ro(mod); if (err) goto out_strict_rwx; err = module_enable_data_nx(mod); @@ -3417,6 +3463,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod->mem[type].size); } + module_memory_restore_rox(mod); module_deallocate(mod, info); free_copy: /* diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index 239e5013359d..03f4142cfbf4 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> +#include <linux/execmem.h> #include "internal.h" static int module_set_memory(const struct module *mod, enum mod_mem_type type, @@ -32,12 +33,12 @@ static int module_set_memory(const struct module *mod, enum mod_mem_type type, int module_enable_text_rox(const struct module *mod) { for_class_mod_mem_type(type, text) { + const struct module_memory *mem = &mod->mem[type]; int ret; - if (mod->mem[type].is_rox) - continue; - - if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + if (mem->is_rox) + ret = execmem_restore_rox(mem->base, mem->size); + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) ret = module_set_memory(mod, type, set_memory_rox); else ret = module_set_memory(mod, type, set_memory_x); @@ -47,7 +48,7 @@ int module_enable_text_rox(const struct module *mod) return 0; } -int module_enable_rodata_ro(const struct module *mod, bool after_init) +int module_enable_rodata_ro(const struct module *mod) { int ret; @@ -61,12 +62,17 @@ int module_enable_rodata_ro(const struct module *mod, bool after_init) if (ret) return ret; - if (after_init) - return module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro); - return 0; } +int module_enable_rodata_ro_after_init(const struct module *mod) +{ + if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX) || !rodata_enabled) + return 0; + 
+ return module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro); +} + int module_enable_data_nx(const struct module *mod) { if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index 456358e1fdc4..b401ff4b02d2 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -19,24 +19,16 @@ * J. Corbet <corbet@lwn.net> */ #ifdef CONFIG_KALLSYMS -struct module_sect_attr { - struct bin_attribute battr; - unsigned long address; -}; - struct module_sect_attrs { struct attribute_group grp; - unsigned int nsections; - struct module_sect_attr attrs[]; + struct bin_attribute attrs[]; }; #define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4)) static ssize_t module_sect_read(struct file *file, struct kobject *kobj, - struct bin_attribute *battr, + const struct bin_attribute *battr, char *buf, loff_t pos, size_t count) { - struct module_sect_attr *sattr = - container_of(battr, struct module_sect_attr, battr); char bounce[MODULE_SECT_READ_SIZE + 1]; size_t wrote; @@ -53,7 +45,7 @@ static ssize_t module_sect_read(struct file *file, struct kobject *kobj, */ wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n", kallsyms_show_value(file->f_cred) - ? (void *)sattr->address : NULL); + ? battr->private : NULL); count = min(count, wrote); memcpy(buf, bounce, count); @@ -62,59 +54,59 @@ static ssize_t module_sect_read(struct file *file, struct kobject *kobj, static void free_sect_attrs(struct module_sect_attrs *sect_attrs) { - unsigned int section; + const struct bin_attribute *const *bin_attr; - for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].battr.attr.name); + for (bin_attr = sect_attrs->grp.bin_attrs_new; *bin_attr; bin_attr++) + kfree((*bin_attr)->attr.name); + kfree(sect_attrs->grp.bin_attrs_new); kfree(sect_attrs); } static int add_sect_attrs(struct module *mod, const struct load_info *info) { - unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; - struct module_sect_attr *sattr; - struct bin_attribute **gattr; + const struct bin_attribute **gattr; + struct bin_attribute *sattr; + unsigned int nloaded = 0, i; int ret; /* Count loaded sections and allocate structures */ for (i = 0; i < info->hdr->e_shnum; i++) if (!sect_empty(&info->sechdrs[i])) nloaded++; - size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.bin_attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); - sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); + sect_attrs = kzalloc(struct_size(sect_attrs, attrs, nloaded), GFP_KERNEL); if (!sect_attrs) return -ENOMEM; + gattr = kcalloc(nloaded + 1, sizeof(*gattr), GFP_KERNEL); + if (!gattr) { + kfree(sect_attrs); + return -ENOMEM; + } + /* Setup section attributes. 
*/ sect_attrs->grp.name = "sections"; - sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; + sect_attrs->grp.bin_attrs_new = gattr; - sect_attrs->nsections = 0; sattr = &sect_attrs->attrs[0]; - gattr = &sect_attrs->grp.bin_attrs[0]; for (i = 0; i < info->hdr->e_shnum; i++) { Elf_Shdr *sec = &info->sechdrs[i]; if (sect_empty(sec)) continue; - sysfs_bin_attr_init(&sattr->battr); - sattr->address = sec->sh_addr; - sattr->battr.attr.name = + sysfs_bin_attr_init(sattr); + sattr->attr.name = kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); - if (!sattr->battr.attr.name) { + if (!sattr->attr.name) { ret = -ENOMEM; goto out; } - sect_attrs->nsections++; - sattr->battr.read = module_sect_read; - sattr->battr.size = MODULE_SECT_READ_SIZE; - sattr->battr.attr.mode = 0400; - *(gattr++) = &(sattr++)->battr; + sattr->read_new = module_sect_read; + sattr->private = (void *)sec->sh_addr; + sattr->size = MODULE_SECT_READ_SIZE; + sattr->attr.mode = 0400; + *(gattr++) = sattr++; } - *gattr = NULL; ret = sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp); if (ret) @@ -146,20 +138,13 @@ static void remove_sect_attrs(struct module *mod) */ struct module_notes_attrs { - struct kobject *dir; - unsigned int notes; - struct bin_attribute attrs[] __counted_by(notes); + struct attribute_group grp; + struct bin_attribute attrs[]; }; -static void free_notes_attrs(struct module_notes_attrs *notes_attrs, - unsigned int i) +static void free_notes_attrs(struct module_notes_attrs *notes_attrs) { - if (notes_attrs->dir) { - while (i-- > 0) - sysfs_remove_bin_file(notes_attrs->dir, - &notes_attrs->attrs[i]); - kobject_put(notes_attrs->dir); - } + kfree(notes_attrs->grp.bin_attrs_new); kfree(notes_attrs); } @@ -167,6 +152,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info) { unsigned int notes, loaded, i; struct module_notes_attrs *notes_attrs; + const struct bin_attribute **gattr; struct bin_attribute *nattr; int ret; @@ -185,47 +171,55 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info) if (!notes_attrs) return -ENOMEM; - notes_attrs->notes = notes; + gattr = kcalloc(notes + 1, sizeof(*gattr), GFP_KERNEL); + if (!gattr) { + kfree(notes_attrs); + return -ENOMEM; + } + + notes_attrs->grp.name = "notes"; + notes_attrs->grp.bin_attrs_new = gattr; + nattr = &notes_attrs->attrs[0]; for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { if (sect_empty(&info->sechdrs[i])) continue; if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; + nattr->attr.name = mod->sect_attrs->attrs[loaded].attr.name; nattr->attr.mode = 0444; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *)info->sechdrs[i].sh_addr; - nattr->read = sysfs_bin_attr_simple_read; - ++nattr; + nattr->read_new = sysfs_bin_attr_simple_read; + *(gattr++) = nattr++; } ++loaded; } - notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); - if (!notes_attrs->dir) { - ret = -ENOMEM; + ret = sysfs_create_group(&mod->mkobj.kobj, &notes_attrs->grp); + if (ret) goto out; - } - - for (i = 0; i < notes; ++i) { - ret = sysfs_create_bin_file(notes_attrs->dir, &notes_attrs->attrs[i]); - if (ret) - goto out; - } mod->notes_attrs = notes_attrs; return 0; out: - free_notes_attrs(notes_attrs, i); + free_notes_attrs(notes_attrs); return ret; } static void remove_notes_attrs(struct module *mod) { - if (mod->notes_attrs) - free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes); + if (mod->notes_attrs) { +
sysfs_remove_group(&mod->mkobj.kobj, + &mod->notes_attrs->grp); + /* + * We are positive that no one is using any notes attrs + * at this point. Deallocate immediately. + */ + free_notes_attrs(mod->notes_attrs); + mod->notes_attrs = NULL; + } } #else /* !CONFIG_KALLSYMS */ @@ -275,7 +269,7 @@ static int add_usage_links(struct module *mod) static void module_remove_modinfo_attrs(struct module *mod, int end) { - struct module_attribute *attr; + const struct module_attribute *attr; int i; for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) { @@ -293,7 +287,7 @@ static void module_remove_modinfo_attrs(struct module *mod, int end) static int module_add_modinfo_attrs(struct module *mod) { - struct module_attribute *attr; + const struct module_attribute *attr; struct module_attribute *temp_attr; int error = 0; int i; diff --git a/kernel/module/version.c b/kernel/module/version.c index 53f43ac5a73e..3718a8868321 100644 --- a/kernel/module/version.c +++ b/kernel/module/version.c @@ -13,17 +13,34 @@ int check_version(const struct load_info *info, const char *symname, struct module *mod, - const s32 *crc) + const u32 *crc) { Elf_Shdr *sechdrs = info->sechdrs; unsigned int versindex = info->index.vers; unsigned int i, num_versions; struct modversion_info *versions; + struct modversion_info_ext version_ext; /* Exporting module didn't supply crcs? OK, we're already tainted. */ if (!crc) return 1; + /* If we have extended version info, rely on it */ + if (info->index.vers_ext_crc) { + for_each_modversion_info_ext(version_ext, info) { + if (strcmp(version_ext.name, symname) != 0) + continue; + if (*version_ext.crc == *crc) + return 1; + pr_debug("Found checksum %X vs module %X\n", + *crc, *version_ext.crc); + goto bad_version; + } + pr_warn_once("%s: no extended symbol version for %s\n", + info->name, symname); + return 1; + } + /* No versions at all? modprobe --force does this. */ if (versindex == 0) return try_to_force_load(mod, symname) == 0; @@ -87,6 +104,34 @@ int same_magic(const char *amagic, const char *bmagic, return strcmp(amagic, bmagic) == 0; } +void modversion_ext_start(const struct load_info *info, + struct modversion_info_ext *start) +{ + unsigned int crc_idx = info->index.vers_ext_crc; + unsigned int name_idx = info->index.vers_ext_name; + Elf_Shdr *sechdrs = info->sechdrs; + + /* + * Both of these fields are needed for this to be useful + * Any future fields should be initialized to NULL if absent. + */ + if (crc_idx == 0 || name_idx == 0) { + start->remaining = 0; + return; + } + + start->crc = (const u32 *)sechdrs[crc_idx].sh_addr; + start->name = (const char *)sechdrs[name_idx].sh_addr; + start->remaining = sechdrs[crc_idx].sh_size / sizeof(*start->crc); +} + +void modversion_ext_advance(struct modversion_info_ext *vers) +{ + vers->remaining--; + vers->crc++; + vers->name += strlen(vers->name) + 1; +} + /* * Generate the signature for all relevant module structures here. * If these change, we don't want to try to parse the module. 
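The version.c hunks above add an iterator over the two new sections: __version_ext_crcs is a flat array of u32 CRCs and __version_ext_names is a packed list of NUL-terminated strings, paired entry for entry. The fragment below walks such a pair the same way modversion_ext_advance() does; the function and parameter names are made up for this sketch, and the buffers are assumed to have already been validated (as elf_validity_cache_index_versions() does at load time).

#include <linux/printk.h>
#include <linux/string.h>
#include <linux/types.h>

/* Illustrative walk over parallel CRC/name data (not a kernel API). */
static void demo_walk_ext_versions(const u32 *crcs, size_t nr_crcs,
				   const char *names)
{
	size_t i;

	for (i = 0; i < nr_crcs; i++) {
		pr_debug("symbol %s -> crc 0x%08x\n", names, crcs[i]);
		/* Names are packed back to back; step past this one's NUL. */
		names += strlen(names) + 1;
	}
}

check_version() in the hunk above performs essentially this walk for a single symname and bails out as soon as the stored CRC disagrees with the exporting module's.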
diff --git a/kernel/padata.c b/kernel/padata.c index d51bbc76b227..b3d4eacc4f5d 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -47,6 +47,22 @@ struct padata_mt_job_state { static void padata_free_pd(struct parallel_data *pd); static void __init padata_mt_helper(struct work_struct *work); +static inline void padata_get_pd(struct parallel_data *pd) +{ + refcount_inc(&pd->refcnt); +} + +static inline void padata_put_pd_cnt(struct parallel_data *pd, int cnt) +{ + if (refcount_sub_and_test(cnt, &pd->refcnt)) + padata_free_pd(pd); +} + +static inline void padata_put_pd(struct parallel_data *pd) +{ + padata_put_pd_cnt(pd, 1); +} + static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { int cpu, target_cpu; @@ -206,7 +222,7 @@ int padata_do_parallel(struct padata_shell *ps, if ((pinst->flags & PADATA_RESET)) goto out; - refcount_inc(&pd->refcnt); + padata_get_pd(pd); padata->pd = pd; padata->cb_cpu = *cb_cpu; @@ -274,7 +290,7 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, if (remove_object) { list_del_init(&padata->list); ++pd->processed; - pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false); + pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); } spin_unlock(&reorder->lock); @@ -336,8 +352,14 @@ static void padata_reorder(struct parallel_data *pd) smp_mb(); reorder = per_cpu_ptr(pd->reorder_list, pd->cpu); - if (!list_empty(&reorder->list) && padata_find_next(pd, false)) + if (!list_empty(&reorder->list) && padata_find_next(pd, false)) { + /* + * Other context(eg. the padata_serial_worker) can finish the request. + * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. + */ + padata_get_pd(pd); queue_work(pinst->serial_wq, &pd->reorder_work); + } } static void invoke_padata_reorder(struct work_struct *work) @@ -348,6 +370,8 @@ static void invoke_padata_reorder(struct work_struct *work) pd = container_of(work, struct parallel_data, reorder_work); padata_reorder(pd); local_bh_enable(); + /* Pairs with putting the reorder_work in the serial_wq */ + padata_put_pd(pd); } static void padata_serial_worker(struct work_struct *serial_work) @@ -380,8 +404,7 @@ static void padata_serial_worker(struct work_struct *serial_work) } local_bh_enable(); - if (refcount_sub_and_test(cnt, &pd->refcnt)) - padata_free_pd(pd); + padata_put_pd_cnt(pd, cnt); } /** @@ -681,8 +704,7 @@ static int padata_replace(struct padata_instance *pinst) synchronize_rcu(); list_for_each_entry_continue_reverse(ps, &pinst->pslist, list) - if (refcount_dec_and_test(&ps->opd->refcnt)) - padata_free_pd(ps->opd); + padata_put_pd(ps->opd); pinst->flags &= ~PADATA_RESET; @@ -970,7 +992,7 @@ static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, pinst = kobj2pinst(kobj); pentry = attr2pentry(attr); - if (pentry->show) + if (pentry->store) ret = pentry->store(pinst, attr, buf, count); return ret; @@ -1121,11 +1143,16 @@ void padata_free_shell(struct padata_shell *ps) if (!ps) return; + /* + * Wait for all _do_serial calls to finish to avoid touching + * freed pd's and ps's. 
+ */ + synchronize_rcu(); + mutex_lock(&ps->pinst->lock); list_del(&ps->list); pd = rcu_dereference_protected(ps->pd, 1); - if (refcount_dec_and_test(&pd->refcnt)) - padata_free_pd(pd); + padata_put_pd(pd); mutex_unlock(&ps->pinst->lock); kfree(ps); diff --git a/kernel/panic.c b/kernel/panic.c index fbc59b3b64d0..d8635d5cecb2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -84,7 +84,7 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); #ifdef CONFIG_SYSCTL -static struct ctl_table kern_panic_table[] = { +static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { .procname = "oops_all_cpu_backtrace", diff --git a/kernel/params.c b/kernel/params.c index 2e447f8ae183..0074d29c9b80 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -538,7 +538,7 @@ const struct kernel_param_ops param_ops_string = { EXPORT_SYMBOL(param_ops_string); /* sysfs output in /sys/modules/XYZ/parameters/ */ -#define to_module_attr(n) container_of(n, struct module_attribute, attr) +#define to_module_attr(n) container_of_const(n, struct module_attribute, attr) #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) struct param_attribute @@ -555,13 +555,13 @@ struct module_param_attrs }; #ifdef CONFIG_SYSFS -#define to_param_attr(n) container_of(n, struct param_attribute, mattr) +#define to_param_attr(n) container_of_const(n, struct param_attribute, mattr) -static ssize_t param_attr_show(struct module_attribute *mattr, +static ssize_t param_attr_show(const struct module_attribute *mattr, struct module_kobject *mk, char *buf) { int count; - struct param_attribute *attribute = to_param_attr(mattr); + const struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->get) return -EPERM; @@ -573,12 +573,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr, } /* sysfs always hands a nul-terminated string in buf. We rely on that. 
*/ -static ssize_t param_attr_store(struct module_attribute *mattr, +static ssize_t param_attr_store(const struct module_attribute *mattr, struct module_kobject *mk, const char *buf, size_t len) { int err; - struct param_attribute *attribute = to_param_attr(mattr); + const struct param_attribute *attribute = to_param_attr(mattr); if (!attribute->param->ops->set) return -EPERM; @@ -857,11 +857,11 @@ static void __init param_sysfs_builtin(void) } } -ssize_t __modver_version_show(struct module_attribute *mattr, +ssize_t __modver_version_show(const struct module_attribute *mattr, struct module_kobject *mk, char *buf) { - struct module_version_attribute *vattr = - container_of(mattr, struct module_version_attribute, mattr); + const struct module_version_attribute *vattr = + container_of_const(mattr, struct module_version_attribute, mattr); return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } @@ -892,7 +892,7 @@ static ssize_t module_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct module_attribute *attribute; + const struct module_attribute *attribute; struct module_kobject *mk; int ret; @@ -911,7 +911,7 @@ static ssize_t module_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { - struct module_attribute *attribute; + const struct module_attribute *attribute; struct module_kobject *mk; int ret; diff --git a/kernel/pid.c b/kernel/pid.c index 3a10a7b6fcf8..4ac2ce46817f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = { }; EXPORT_SYMBOL_GPL(init_pid_ns); -/* - * Note: disable interrupts while the pidmap_lock is held as an - * interrupt might come in and do read_lock(&tasklist_lock). - * - * If we don't disable interrupts there is a nasty deadlock between - * detach_pid()->free_pid() and another cpu that does - * spin_lock(&pidmap_lock) followed by an interrupt routine that does - * read_lock(&tasklist_lock); - * - * After we clean up the tasklist_lock and know there are no - * irq handlers that take it we can leave the interrupts enabled. - * For now it is easier to be safe than to prove it can't happen. - */ - static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); @@ -128,11 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { - /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; - unsigned long flags; - spin_lock_irqsave(&pidmap_lock, flags); + lockdep_assert_not_held(&tasklist_lock); + + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; @@ -155,11 +141,23 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } pidfs_remove_pid(pid); - spin_unlock_irqrestore(&pidmap_lock, flags); + spin_unlock(&pidmap_lock); call_rcu(&pid->rcu, delayed_put_pid); } +void free_pids(struct pid **pids) +{ + int tmp; + + /* + * This can batch pidmap_lock. 
+ */ + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) + if (pids[tmp]) + free_pid(pids[tmp]); +} + struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { @@ -211,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, @@ -238,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); if (nr < 0) { @@ -272,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, upid = pid->numbers + ns->level; idr_preload(GFP_KERNEL); - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; pidfs_add_pid(pid); @@ -281,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); return pid; out_unlock: - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); idr_preload_end(); put_pid_ns(ns); out_free: - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); @@ -302,7 +300,7 @@ out_free: if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -310,9 +308,9 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { - spin_lock_irq(&pidmap_lock); + spin_lock(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; - spin_unlock_irq(&pidmap_lock); + spin_unlock(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) @@ -339,17 +337,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) */ void attach_pid(struct task_struct *task, enum pid_type type) { - struct pid *pid = *task_pid_ptr(task, type); + struct pid *pid; + + lockdep_assert_held_write(&tasklist_lock); + + pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } -static void __change_pid(struct task_struct *task, enum pid_type type, - struct pid *new) +static void __change_pid(struct pid **pids, struct task_struct *task, + enum pid_type type, struct pid *new) { - struct pid **pid_ptr = task_pid_ptr(task, type); - struct pid *pid; + struct pid **pid_ptr, *pid; int tmp; + lockdep_assert_held_write(&tasklist_lock); + + pid_ptr = task_pid_ptr(task, type); pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); @@ -364,18 +368,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type, if (pid_has_task(pid, tmp)) return; - free_pid(pid); + WARN_ON(pids[type]); + pids[type] = pid; } -void detach_pid(struct task_struct *task, enum pid_type type) +void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type) { - __change_pid(task, type, NULL); + __change_pid(pids, task, type, NULL); } -void change_pid(struct task_struct *task, enum pid_type type, +void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type, struct pid *pid) { - __change_pid(task, type, pid); + __change_pid(pids, task, type, pid); attach_pid(task, type); } @@ -386,6 +391,8 @@ void exchange_tids(struct task_struct *left, struct 
task_struct *right) struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; + lockdep_assert_held_write(&tasklist_lock); + /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); @@ -403,6 +410,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { WARN_ON_ONCE(type == PIDTYPE_PID); + lockdep_assert_held_write(&tasklist_lock); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } @@ -564,15 +572,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) */ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) { - unsigned int f_flags; + unsigned int f_flags = 0; struct pid *pid; struct task_struct *task; + enum pid_type type; - pid = pidfd_get_pid(pidfd, &f_flags); - if (IS_ERR(pid)) - return ERR_CAST(pid); + switch (pidfd) { + case PIDFD_SELF_THREAD: + type = PIDTYPE_PID; + pid = get_task_pid(current, type); + break; + case PIDFD_SELF_THREAD_GROUP: + type = PIDTYPE_TGID; + pid = get_task_pid(current, type); + break; + default: + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + type = PIDTYPE_TGID; + break; + } - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, type); put_pid(pid); if (!task) return ERR_PTR(-ESRCH); @@ -695,7 +717,7 @@ static struct ctl_table_root pid_table_root = { .set_ownership = pid_table_root_set_ownership, }; -static struct ctl_table pid_table[] = { +static const struct ctl_table pid_table[] = { { .procname = "pid_max", .data = &init_pid_ns.pid_max, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f1ffa032fc32..7098ed44e717 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -107,7 +107,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns goto out_free_idr; ns->ns.ops = &pidns_operations; - ns->pid_max = parent_pid_ns->pid_max; + ns->pid_max = PID_MAX_LIMIT; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; @@ -303,7 +303,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table pid_ns_ctl_table[] = { +static const struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index 18ecaef6be41..5d8f981de7c5 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -31,7 +31,7 @@ static int pid_mfd_noexec_dointvec_minmax(const struct ctl_table *table, return err; } -static struct ctl_table pid_ns_ctl_table_vm[] = { +static const struct ctl_table pid_ns_ctl_table_vm[] = { { .procname = "memfd_noexec", .data = &init_pid_ns.memfd_noexec_scope, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index afce8130d8b9..54a623680019 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -257,11 +257,30 @@ config DPM_WATCHDOG boot session. config DPM_WATCHDOG_TIMEOUT - int "Watchdog timeout in seconds" + int "Watchdog timeout to panic in seconds" range 1 120 default 120 depends on DPM_WATCHDOG +config DPM_WATCHDOG_WARNING_TIMEOUT + int "Watchdog timeout to warn in seconds" + range 1 DPM_WATCHDOG_TIMEOUT + default DPM_WATCHDOG_TIMEOUT + depends on DPM_WATCHDOG + help + If the DPM watchdog warning timeout and main timeout are + different then a non-fatal warning (with a stack trace of + the stuck suspend routine) will be printed when the warning + timeout expires. 
If the suspend routine gets un-stuck + before the main timeout expires then no other action is + taken. If the routine continues to be stuck and the main + timeout expires then an emergency-level message and stack + trace will be printed and the system will panic. + + If the warning timeout is equal to the main timeout (the + default) then the warning will never happen and the system + will jump straight to panic when the main timeout expires. + config PM_TRACE bool help @@ -361,8 +380,7 @@ config CPU_PM config ENERGY_MODEL bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" - depends on SMP - depends on CPU_FREQ + depends on CPU_FREQ || PM_DEVFREQ help Several subsystems (thermal and/or the task scheduler for example) can leverage information about the energy consumed by devices to diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index b29c8aca7486..865df641b97c 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -9,7 +9,6 @@ #include <linux/device.h> #include <linux/mutex.h> -#include <linux/pm_wakeup.h> #include "power.h" diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d07faf42eace..d9b7e2b38c7a 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -161,22 +161,10 @@ static void em_debug_create_pd(struct device *dev) {} static void em_debug_remove_pd(struct device *dev) {} #endif -static void em_destroy_table_rcu(struct rcu_head *rp) -{ - struct em_perf_table __rcu *table; - - table = container_of(rp, struct em_perf_table, rcu); - kfree(table); -} - static void em_release_table_kref(struct kref *kref) { - struct em_perf_table __rcu *table; - /* It was the last owner of this table so we can free */ - table = container_of(kref, struct em_perf_table, kref); - - call_rcu(&table->rcu, em_destroy_table_rcu); + kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu); } /** @@ -185,7 +173,7 @@ static void em_release_table_kref(struct kref *kref) * * No return values. */ -void em_table_free(struct em_perf_table __rcu *table) +void em_table_free(struct em_perf_table *table) { kref_put(&table->kref, em_release_table_kref); } @@ -198,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table) * has a user. * Returns allocated table or NULL. */ -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) { - struct em_perf_table __rcu *table; + struct em_perf_table *table; int table_size; table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; @@ -239,7 +227,7 @@ static void em_init_performance(struct device *dev, struct em_perf_domain *pd, } static int em_compute_costs(struct device *dev, struct em_perf_state *table, - struct em_data_callback *cb, int nr_states, + const struct em_data_callback *cb, int nr_states, unsigned long flags) { unsigned long prev_cost = ULONG_MAX; @@ -308,9 +296,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, * Return 0 on success or an error code on failure. 
*/ int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table) + struct em_perf_table *new_table) { - struct em_perf_table __rcu *old_table; + struct em_perf_table *old_table; struct em_perf_domain *pd; if (!dev) @@ -327,7 +315,8 @@ int em_dev_update_perf_domain(struct device *dev, kref_get(&new_table->kref); - old_table = pd->em_table; + old_table = rcu_dereference_protected(pd->em_table, + lockdep_is_held(&em_pd_mutex)); rcu_assign_pointer(pd->em_table, new_table); em_cpufreq_update_efficiencies(dev, new_table->state); @@ -341,7 +330,7 @@ EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, struct em_perf_state *table, - struct em_data_callback *cb, + const struct em_data_callback *cb, unsigned long flags) { unsigned long power, freq, prev_freq = 0; @@ -396,10 +385,11 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, + const struct em_data_callback *cb, + const cpumask_t *cpus, unsigned long flags) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; int cpu, ret, num_cpus; @@ -556,9 +546,10 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, - bool microwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { + struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; int cpu, ret; @@ -631,7 +622,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, dev->em_pd->min_perf_state = 0; dev->em_pd->max_perf_state = nr_states - 1; - em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); + em_table = rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex)); + em_cpufreq_update_efficiencies(dev, em_table->state); em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); @@ -668,7 +661,8 @@ void em_dev_unregister_perf_domain(struct device *dev) mutex_lock(&em_pd_mutex); em_debug_remove_pd(dev); - em_table_free(dev->em_pd->em_table); + em_table_free(rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex))); kfree(dev->em_pd); dev->em_pd = NULL; @@ -676,9 +670,9 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); -static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) +static struct em_perf_table *em_table_dup(struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_state *ps, *new_ps; int ps_size; @@ -700,7 +694,7 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) } static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, - struct em_perf_table __rcu *em_table) + struct em_perf_table *em_table) { int ret; @@ -728,10 +722,9 @@ free_em_table: * are correctly calculated. 
*/ static void em_adjust_new_capacity(struct device *dev, - struct em_perf_domain *pd, - u64 max_cap) + struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; em_table = em_table_dup(pd); if (!em_table) { @@ -775,7 +768,8 @@ static void em_check_capacity_update(void) } cpufreq_cpu_put(policy); - pd = em_cpu_get(cpu); + dev = get_cpu_device(cpu); + pd = em_pd_get(dev); if (!pd || em_is_artificial(pd)) continue; @@ -799,8 +793,7 @@ static void em_check_capacity_update(void) pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, cpu_capacity, em_max_perf); - dev = get_cpu_device(cpu); - em_adjust_new_capacity(dev, pd, cpu_capacity); + em_adjust_new_capacity(dev, pd); } free_cpumask_var(cpu_done_mask); @@ -822,7 +815,7 @@ static void em_update_workfn(struct work_struct *work) */ int em_dev_update_chip_binning(struct device *dev) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; int i, ret; @@ -908,3 +901,20 @@ int em_update_performance_limits(struct em_perf_domain *pd, return 0; } EXPORT_SYMBOL_GPL(em_update_performance_limits); + +static void rebuild_sd_workfn(struct work_struct *work) +{ + rebuild_sched_domains_energy(); +} + +void em_rebuild_sched_domains(void) +{ + static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + + /* + * When called from the cpufreq_register_driver() path, the + * cpu_hotplug_lock is already held, so use a work item to + * avoid nested locking in rebuild_sched_domains(). + */ + schedule_work(&rebuild_sd_work); +} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f87aa01ba44..b129ed1d25a8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -608,7 +608,11 @@ int hibernation_platform_enter(void) local_irq_disable(); system_state = SYSTEM_SUSPEND; - syscore_suspend(); + + error = syscore_suspend(); + if (error) + goto Enable_irqs; + if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; @@ -620,6 +624,7 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + Enable_irqs: system_state = SYSTEM_RUNNING; local_irq_enable(); @@ -1441,10 +1446,10 @@ static const char * const comp_alg_enabled[] = { static int hibernate_compressor_param_set(const char *compressor, const struct kernel_param *kp) { - unsigned int sleep_flags; int index, ret; - sleep_flags = lock_system_sleep(); + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; index = sysfs_match_string(comp_alg_enabled, compressor); if (index >= 0) { @@ -1456,7 +1461,7 @@ static int hibernate_compressor_param_set(const char *compressor, ret = index; } - unlock_system_sleep(sleep_flags); + mutex_unlock(&system_transition_mutex); if (ret) pr_debug("Cannot set specified compressor %s\n", diff --git a/kernel/power/power.h b/kernel/power/power.h index de0e6b1077f2..c352dea2f67b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -110,7 +110,7 @@ extern int hibernate_preallocate_memory(void); extern void clear_or_poison_free_pages(void); -/** +/* * Auxiliary structure used for reading the snapshot image data and * metadata from and writing them to the list of page backup entries * (PBEs) which is the main data structure of swsusp. 
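The energy_model.c hunks earlier in this group drop the __rcu annotation from local table pointers and instead reach pd->em_table through rcu_dereference_protected() under em_pd_mutex, retiring old tables with kfree_rcu() once their last kref is dropped. The fragment below is a condensed sketch of that publish/retire pattern; demo_table and demo_update are made-up names, and allocation plus kref_init() are omitted for brevity.

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_table {
	struct rcu_head rcu;
	struct kref kref;
	int data[];
};

static void demo_table_release(struct kref *kref)
{
	/* Defer the free until readers inside RCU sections are done. */
	kfree_rcu(container_of(kref, struct demo_table, kref), rcu);
}

static void demo_update(struct demo_table __rcu **slot,
			struct demo_table *new, struct mutex *lock)
{
	struct demo_table *old;

	mutex_lock(lock);
	/* Holding the mutex makes it safe to look at the current pointer. */
	old = rcu_dereference_protected(*slot, lockdep_is_held(lock));
	kref_get(&new->kref);
	rcu_assign_pointer(*slot, new);
	mutex_unlock(lock);

	if (old)
		kref_put(&old->kref, demo_table_release);
}

Readers, by contrast, take rcu_read_lock() and rcu_dereference() the slot, which is why the unpublished table has to survive a grace period before it is freed.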
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 30894d8f0a78..4e6e24e8b854 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1011,11 +1011,8 @@ void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pf } } /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), + region = memblock_alloc_or_panic(sizeof(struct nosave_region), SMP_CACHE_BYTES); - if (!region) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); @@ -2273,9 +2270,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2564,9 +2561,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page); + dst = kmap_local_page(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); last_highmem_page = NULL; } } @@ -2884,13 +2881,13 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2, { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1); - kaddr2 = kmap_atomic(p2); + kaddr1 = kmap_local_page(p1); + kaddr2 = kmap_local_page(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2); - kunmap_atomic(kaddr1); + kunmap_local(kaddr2); + kunmap_local(kaddr1); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 09f8397bae15..6fae1e0a331c 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -91,6 +91,16 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); + /* + * The correctness of the code below depends on the number of online + * CPUs being stable, but CPUs cannot be taken offline or put online + * while it is running. + * + * The s2idle_lock must be acquired before the pending wakeup check to + * prevent pm_system_wakeup() from running as a whole between that check + * and the subsequent s2idle_state update in which case a wakeup event + * would get lost. + */ raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; @@ -98,8 +108,6 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - cpus_read_lock(); - /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. 
*/ @@ -112,8 +120,6 @@ static void s2idle_enter(void) */ wake_up_all_idle_cpus(); - cpus_read_unlock(); - raw_spin_lock_irq(&s2idle_lock); out: diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index c6bb47666aef..a91bdf802967 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -338,3 +338,9 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped); void console_prepend_replay(struct printk_message *pmsg); #endif + +#ifdef CONFIG_SMP +bool is_printk_cpu_sync_owner(void); +#else +static inline bool is_printk_cpu_sync_owner(void) { return false; } +#endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 80910bc3470c..057db78876cd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -523,7 +523,7 @@ static struct latched_seq clear_seq = { /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) -#define LOG_BUF_LEN_MAX (u32)(1 << 31) +#define LOG_BUF_LEN_MAX ((u32)1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -2461,7 +2461,6 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); -static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ @@ -2474,7 +2473,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre static u64 syslog_seq; -static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -4466,7 +4464,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ -static bool pr_flush(int timeout_ms, bool reset_on_progress) +bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } @@ -4922,6 +4920,11 @@ void console_try_replay_all(void) static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); +bool is_printk_cpu_sync_owner(void) +{ + return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id()); +} + /** * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 6f94418d53ff..32a28f563b13 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -61,10 +61,15 @@ bool is_printk_legacy_deferred(void) /* * The per-CPU variable @printk_context can be read safely in any * context. CPU migration is always disabled when set. + * + * A context holding the printk_cpu_sync must not spin waiting for + * another CPU. For legacy printing, it could be the console_lock + * or the port lock. 
*/ return (force_legacy_kthread() || this_cpu_read(printk_context) || - in_nmi()); + in_nmi() || + is_printk_cpu_sync_owner()); } asmlinkage int vprintk(const char *fmt, va_list args) @@ -74,15 +79,6 @@ asmlinkage int vprintk(const char *fmt, va_list args) if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); #endif - - /* - * Use the main logbuf even in NMI. But avoid calling console - * drivers that might have their own locks. - */ - if (is_printk_legacy_deferred()) - return vprintk_deferred(fmt, args); - - /* No obstacles. */ return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c index f5072dc85f7a..da77f3f5c1fe 100644 --- a/kernel/printk/sysctl.c +++ b/kernel/printk/sysctl.c @@ -20,7 +20,7 @@ static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int writ return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } -static struct ctl_table printk_sysctls[] = { +static const struct ctl_table printk_sysctls[] = { { .procname = "printk", .data = &console_loglevel, diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b9b6bc55185d..aa42de4d2768 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -18,7 +18,7 @@ config TREE_RCU config PREEMPT_RCU bool - default y if PREEMPTION + default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC) select TREE_RCU help This option selects the RCU implementation that is @@ -65,6 +65,17 @@ config TREE_SRCU help This option selects the full-fledged version of SRCU. +config FORCE_NEED_SRCU_NMI_SAFE + bool "Force selection of NEED_SRCU_NMI_SAFE" + depends on !TINY_SRCU + select NEED_SRCU_NMI_SAFE + default n + help + This option forces selection of the NEED_SRCU_NMI_SAFE + Kconfig option, allowing testing of srcu_read_lock_nmisafe() + and srcu_read_unlock_nmisafe() on architectures (like x86) + that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option. + config NEED_SRCU_NMI_SAFE def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU @@ -91,7 +102,7 @@ config NEED_TASKS_RCU config TASKS_RCU bool - default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) + default NEED_TASKS_RCU && PREEMPTION select IRQ_WORK config FORCE_TASKS_RUDE_RCU @@ -323,21 +334,27 @@ config RCU_LAZY depends on RCU_NOCB_CPU default n help - To save power, batch RCU callbacks and flush after delay, memory - pressure, or callback list growing too big. + To save power, batch RCU callbacks and delay starting the + corresponding grace period for multiple seconds. The grace + period will be started after this delay, in case of memory + pressure, or if the corresponding CPU's callback list grows + too large. - Requires rcu_nocbs=all to be set. + These delays happen only on rcu_nocbs CPUs, that is, CPUs + whose callbacks have been offloaded. - Use rcutree.enable_rcu_lazy=0 to turn it off at boot time. + Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to + globally disable these delays. config RCU_LAZY_DEFAULT_OFF bool "Turn RCU lazy invocation off by default" depends on RCU_LAZY default n help - Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default - off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch - it back on. + Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel + to boot with these energy-efficiency delays disabled. Use the + rcutree.enable_rcu_lazy=1 kernel-boot parameter to override + this option at boot time, thus re-enabling these delays. 
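As the reworked RCU_LAZY help text above explains, callbacks queued on offloaded (rcu_nocbs) CPUs may have their grace period deferred for several seconds to save power. Callers that cannot tolerate that deferral use call_rcu_hurry(), which is also what the rcutorture rcu_ops in this patch wires up as its ->call() method. Below is a hedged sketch of that caller-side choice; struct my_obj, my_release() and my_retire() are made-up names used only for illustration.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
	struct rcu_head rcu;
	/* ... payload ... */
};

static void my_release(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct my_obj, rcu));
}

static void my_retire(struct my_obj *p, bool latency_sensitive)
{
	if (latency_sensitive)
		/* Ask for a promptly started grace period. */
		call_rcu_hurry(&p->rcu, my_release);
	else
		/* May sit in the lazy batch for multiple seconds. */
		call_rcu(&p->rcu, my_release);
}

Both primitives take the same arguments; the only difference is whether the callback is allowed to wait out the lazy batching window.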
config RCU_DOUBLE_CHECK_CB_TIME bool "RCU callback-batch backup time check" diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 9b0b52e1836f..12e4c64ebae1 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -53,6 +53,51 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_TORTURE_TEST_CHK_RDR_STATE + bool "Check rcutorture reader state" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to check the desired rcutorture + reader state for each segment against the actual context. + Note that PREEMPT_COUNT must be enabled if the preempt-disabled + and bh-disabled checks are to take effect, and that PREEMPT_RCU + must be enabled for the RCU-nesting checks to take effect. + These checks add overhead, and this Kconfig option is therefore + disabled by default. + + Say Y here if you want rcutorture reader contexts checked. + Say N if you are unsure. + +config RCU_TORTURE_TEST_LOG_CPU + bool "Log CPU for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + number of the CPU that the reader was running on at the time. + This information can be useful, but it does incur additional + overhead, overhead that can make both failures and close calls + less probable. + + Say Y here if you want CPU IDs logged. + Say N if you are unsure. + +config RCU_TORTURE_TEST_LOG_GP + bool "Log grace-period numbers for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + corresponding grace-period sequence numbers. This information + can be useful, but it does incur additional overhead, overhead + that can make both failures and close calls less probable. + + Say Y here if you want grace-period sequence numbers logged. + Say N if you are unsure. 
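The three Kconfig options added above are consumed in rcutorture.c later in this patch through IS_ENABLED() tests (for example, rcutorture_one_extend_check() returns immediately unless CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE is set), so the extra checks compile down to nothing when the options are off. A small sketch of that gating pattern follows; CONFIG_MY_DEBUG_CHECKS and my_check_context() are placeholders rather than real kernel symbols.

#include <linux/kconfig.h>
#include <linux/bug.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>

static void my_check_context(bool want_irqs_off, bool want_bh_off)
{
	/* With the option off, the whole body is dead code and is elided. */
	if (!IS_ENABLED(CONFIG_MY_DEBUG_CHECKS))
		return;

	WARN_ONCE(want_irqs_off && !irqs_disabled(),
		  "expected interrupts to be disabled\n");

	/* Softirq state shows up in preempt_count() only with CONFIG_PREEMPT_COUNT=y. */
	if (IS_ENABLED(CONFIG_PREEMPT_COUNT))
		WARN_ONCE(want_bh_off && !(preempt_count() & SOFTIRQ_MASK),
			  "expected softirqs to be disabled\n");
}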
+ config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index feb3ac1dc5d5..eed2951a4962 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s) { unsigned long cur_s = READ_ONCE(*sp); - return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1)); + return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1)); } /* @@ -590,6 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif static inline void rcu_gp_set_torture_wait(int duration) { } #endif +unsigned long long rcutorture_gather_gp_seqs(void); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len); #ifdef CONFIG_TINY_SRCU @@ -611,8 +613,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } -static inline unsigned long -srcu_batches_completed(struct srcu_struct *sp) { return 0; } static inline void rcu_force_quiescent_state(void) { } static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; } static inline void show_rcu_gp_kthreads(void) { } @@ -624,7 +624,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); -unsigned long srcu_batches_completed(struct srcu_struct *sp); bool rcu_check_boost_fail(unsigned long gp_state, int *cpup); void show_rcu_gp_kthreads(void); int rcu_get_gp_kthreads_prio(void); @@ -636,6 +635,12 @@ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TINY_SRCU +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +#else // #ifdef CONFIG_TINY_SRCU +unsigned long srcu_batches_completed(struct srcu_struct *sp); +#endif // #else // #ifdef CONFIG_TINY_SRCU + #ifdef CONFIG_RCU_NOCB_CPU void rcu_bind_current_to_nocb(void); #else diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 612d27690335..65095664f5c5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -92,12 +92,20 @@ torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives"); torture_param(bool, gp_cond_exp_full, false, "Use conditional/async full-stateexpedited GP wait primitives"); +torture_param(int, gp_cond_wi, 16 * USEC_PER_SEC / HZ, + "Wait interval for normal conditional grace periods, us (default 16 jiffies)"); +torture_param(int, gp_cond_wi_exp, 128, + "Wait interval for expedited conditional grace periods, us (default 128 us)"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives"); torture_param(bool, gp_poll_exp_full, false, 
"Use polling full-state expedited GP wait primitives"); +torture_param(int, gp_poll_wi, 16 * USEC_PER_SEC / HZ, + "Wait interval for normal polled grace periods, us (default 16 jiffies)"); +torture_param(int, gp_poll_wi_exp, 128, + "Wait interval for expedited polled grace periods, us (default 128 us)"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -109,9 +117,11 @@ torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); +torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable"); +torture_param(int, preempt_interval, MSEC_PER_SEC, "Interval between preemptions (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); -torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit."); +torture_param(int, reader_flavor, SRCU_READ_FLAVOR_NORMAL, "Reader flavors to use, one per bit."); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -125,6 +135,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s torture_param(int, stutter, 5, "Number of seconds to run/halt test"); torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds."); +torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds."); torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); @@ -137,6 +148,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); static int nrealnocbers; static int nrealreaders; +static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; @@ -149,6 +161,7 @@ static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; static struct task_struct *read_exit_task; +static struct task_struct *preempt_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -259,10 +272,16 @@ struct rt_read_seg { unsigned long rt_delay_ms; unsigned long rt_delay_us; bool rt_preempted; + int rt_cpu; + int rt_end_cpu; + unsigned long long rt_gp_seq; + unsigned long long rt_gp_seq_end; + u64 rt_ts; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; static int rt_read_nsegs; +static int rt_read_preempted; static const char *rcu_torture_writer_state_getname(void) { @@ -353,7 +372,8 @@ struct rcu_torture_ops { void (*read_delay)(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp); void (*readunlock)(int idx); - int (*readlock_held)(void); + int 
(*readlock_held)(void); // lockdep. + int (*readlock_nesting)(void); // actual nesting, if available, -1 if not. unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); @@ -390,6 +410,9 @@ struct rcu_torture_ops { void (*get_gp_data)(int *flags, unsigned long *gp_seq); void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); + bool (*reader_blocked)(void); + unsigned long long (*gather_gp_seqs)(void); + void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len); long cbflood_max; int irq_capable; int can_boost; @@ -448,10 +471,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) rtrsp->rt_delay_us = shortdelay_us; } if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 500))) { + !(torture_random(rrsp) % (nrealreaders * 500))) torture_preempt_schedule(); /* QS only if preemptible. */ - rtrsp->rt_preempted = true; - } } static void rcu_torture_read_unlock(int idx) @@ -459,6 +480,15 @@ static void rcu_torture_read_unlock(int idx) rcu_read_unlock(); } +static int rcu_torture_readlock_nesting(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + return rcu_preempt_depth(); + if (IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return (preempt_count() & PREEMPT_MASK); + return -1; +} + /* * Update callback in the pipe. This should be invoked after a grace period. */ @@ -548,6 +578,7 @@ static struct rcu_torture_ops rcu_ops = { .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .readlock_held = torture_readlock_not_held, + .readlock_nesting = rcu_torture_readlock_nesting, .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .deferred_free = rcu_torture_deferred_free, @@ -573,6 +604,7 @@ static struct rcu_torture_ops rcu_ops = { .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, .poll_gp_state_exp = poll_state_synchronize_rcu, .cond_sync_exp = cond_synchronize_rcu_expedited, + .cond_sync_exp_full = cond_synchronize_rcu_expedited_full, .call = call_rcu_hurry, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -582,6 +614,11 @@ static struct rcu_torture_ops rcu_ops = { .get_gp_data = rcutorture_get_gp_data, .gp_slow_register = rcu_gp_slow_register, .gp_slow_unregister = rcu_gp_slow_unregister, + .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) + ? 
has_rcu_reader_blocked + : NULL, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -627,7 +664,10 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, + .gather_gp_seqs = rcutorture_gather_gp_seqs, + .format_gp_seqs = rcutorture_format_gp_seqs, .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" }; @@ -648,23 +688,32 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) static int srcu_torture_read_lock(void) { int idx; + struct srcu_ctr __percpu *scp; int ret = 0; - if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) { + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { idx = srcu_read_lock(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); ret += idx; } - if (reader_flavor & 0x2) { + if (reader_flavor & SRCU_READ_FLAVOR_NMI) { idx = srcu_read_lock_nmisafe(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); ret += idx << 1; } - if (reader_flavor & 0x4) { + if (reader_flavor & SRCU_READ_FLAVOR_LITE) { idx = srcu_read_lock_lite(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_read_lock_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + ret += idx << 3; + } return ret; } @@ -690,11 +739,13 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); - if (reader_flavor & 0x4) + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + if (reader_flavor & SRCU_READ_FLAVOR_LITE) srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); - if (reader_flavor & 0x2) + if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); - if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) srcu_read_unlock(srcu_ctlp, idx & 0x1); } @@ -762,6 +813,7 @@ static struct rcu_torture_ops srcu_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -805,6 +857,7 @@ static struct rcu_torture_ops srcud_ops = { .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, + .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, @@ -857,7 +910,7 @@ static void synchronize_rcu_trivial(void) int cpu; for_each_online_cpu(cpu) { - torture_sched_setaffinity(current->pid, cpumask_of(cpu)); + torture_sched_setaffinity(current->pid, cpumask_of(cpu), true); WARN_ON_ONCE(raw_smp_processor_id() != cpu); } } @@ -1119,8 +1172,19 @@ static int rcu_torture_boost(void *arg) unsigned long gp_state; unsigned long gp_state_time; unsigned long oldstarttime; + unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ; - 
VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) { + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); + } else { + VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period"); + while (time_before(jiffies, booststarttime)) { + schedule_timeout_idle(HZ); + if (kthread_should_stop()) + goto cleanup; + } + VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period"); + } /* Set real-time priority. */ sched_set_fifo_low(current); @@ -1196,6 +1260,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); +cleanup: /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); @@ -1347,6 +1412,7 @@ static void rcu_torture_write_types(void) pr_alert("%s: gp_sync without primitives.\n", __func__); } pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes); + pr_info("%s: gp_cond_wi %d gp_cond_wi_exp %d gp_poll_wi %d gp_poll_wi_exp %d\n", __func__, gp_cond_wi, gp_cond_wi_exp, gp_poll_wi, gp_poll_wi_exp); } /* @@ -1513,7 +1579,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; gp_snap = cur_ops->get_gp_state(); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC; cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1521,7 +1588,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_EXP: rcu_torture_writer_state = RTWS_COND_GET_EXP; gp_snap = cur_ops->get_gp_state_exp(); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_EXP; cur_ops->cond_sync_exp(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1529,7 +1597,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_FULL: rcu_torture_writer_state = RTWS_COND_GET_FULL; cur_ops->get_gp_state_full(&gp_snap_full); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_FULL; cur_ops->cond_sync_full(&gp_snap_full); rcu_torture_pipe_update(old_rp); @@ -1537,7 +1606,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_EXP_FULL: rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL; cur_ops->get_gp_state_full(&gp_snap_full); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL; cur_ops->cond_sync_exp_full(&gp_snap_full); rcu_torture_pipe_update(old_rp); @@ -1557,8 +1627,8 @@ rcu_torture_writer(void *arg) break; } WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); } rcu_torture_pipe_update(old_rp); break; @@ -1578,8 +1648,8 @@ rcu_torture_writer(void *arg) break; } WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); } rcu_torture_pipe_update(old_rp); break; @@ -1588,8 +1658,8 @@ rcu_torture_writer(void *arg) gp_snap = cur_ops->start_gp_poll_exp(); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; while (!cur_ops->poll_gp_state_exp(gp_snap)) 
- torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_POLL_GET_EXP_FULL: @@ -1597,8 +1667,8 @@ rcu_torture_writer(void *arg) cur_ops->start_gp_poll_exp_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; while (!cur_ops->poll_gp_state_full(&gp_snap_full)) - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_SYNC: @@ -1694,7 +1764,7 @@ rcu_torture_fakewriter(void *arg) do { torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && - torture_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nrealfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else { switch (synctype[torture_random(&rand) % nsynctypes]) { @@ -1835,6 +1905,62 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign. } +// Verify the specified RCUTORTURE_RDR* state. +#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() +static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) +{ + int mask; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) + return; + + WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS); + WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS); + + // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable. + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + !(preempt_count() & PREEMPT_MASK), ROEC_ARGS); + WARN_ONCE(cur_ops->readlock_nesting && + (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + cur_ops->readlock_nesting() == 0, ROEC_ARGS); + + // Timer handlers have all sorts of stuff disabled, so ignore + // unintended disabling. + if (insoftirq) + return; + + WARN_ONCE(cur_ops->extendables && + !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + + /* + * non-preemptible RCU in a preemptible kernel uses preempt_disable() + * as rcu_read_lock(). + */ + mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + + WARN_ONCE(cur_ops->extendables && !(curstate & mask) && + (preempt_count() & PREEMPT_MASK), ROEC_ARGS); + + /* + * non-preemptible RCU in a preemptible kernel uses "preempt_count() & + * PREEMPT_MASK" as ->readlock_nesting(). 
+ */ + mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; + if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) + mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && + cur_ops->readlock_nesting() > 0, ROEC_ARGS); +} + /* * Do one extension of an RCU read-side critical section using the * current reader state in readstate (set to zero for initial entry @@ -1844,10 +1970,11 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, * beginning or end of the critical section and if there was actually a * change, do a ->read_delay(). */ -static void rcutorture_one_extend(int *readstate, int newstate, +static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + bool first; unsigned long flags; int idxnew1 = -1; int idxnew2 = -1; @@ -1856,8 +1983,10 @@ static void rcutorture_one_extend(int *readstate, int newstate, int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate; + first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); + rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -1876,6 +2005,28 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesnew & RCUTORTURE_RDR_RCU_2) idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; + // Complain unless both the old and the new protection is in place. + rcutorture_one_extend_check("during change", + idxold1 | statesnew, statesnew, statesold, insoftirq); + + // Sample CPU under both sets of protections to reduce confusion. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + int cpu = raw_smp_processor_id(); + rtrsp->rt_cpu = cpu; + if (!first) { + rtrsp[-1].rt_end_cpu = cpu; + if (cur_ops->reader_blocked) + rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); + } + } + // Sample grace-period sequence number, as good a place as any. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) { + rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs(); + rtrsp->rt_ts = ktime_get_mono_fast_ns(); + if (!first) + rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq; + } + /* * Next, remove old protection, in decreasing order of strength * to avoid unlock paths that aren't safe in the stronger @@ -1926,6 +2077,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, WARN_ON_ONCE(*readstate < 0); if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) pr_info("Unexpected readstate value of %#x\n", *readstate); + rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1992,7 +2144,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * critical section. 
*/ static struct rt_read_seg * -rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, +rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { int i; @@ -2007,7 +2159,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); - rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); + rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]); } return &rtrsp[j]; } @@ -2028,6 +2180,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) int newstate; struct rcu_torture *p; int pipe_count; + bool preempted = false; int readstate = 0; struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; struct rt_read_seg *rtrsp = &rtseg[0]; @@ -2036,7 +2189,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); + rcutorture_one_extend(&readstate, newstate, myid < 0, trsp, rtrsp++); if (checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) cookie = cur_ops->get_gp_state(); @@ -2049,13 +2202,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) !cur_ops->readlock_held || cur_ops->readlock_held()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); + rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); return false; } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); rcu_torture_reader_do_mbchk(myid, p, trsp); - rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); + rtrsp = rcutorture_loop_extend(&readstate, myid < 0, trsp, rtrsp); preempt_disable(); pipe_count = READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -2093,7 +2246,9 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) rcu_torture_writer_state, cpumask_pr_args(cpu_online_mask)); } - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); + if (cur_ops->reader_blocked) + preempted = cur_ops->reader_blocked(); + rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. 
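The ->readlock_nesting() hook consulted by the reader-state checks above reports how deeply nested the current RCU reader is: preemptible RCU tracks this per task via rcu_preempt_depth(), while for non-preemptible RCU a reader is simply a preempt-disabled region, so the preempt count is the closest available proxy. A condensed restatement of that fallback logic, with my_rcu_nesting() as an illustrative wrapper name:

#include <linux/kconfig.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>

static int my_rcu_nesting(void)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RCU))
		return rcu_preempt_depth();		/* per-task rcu_read_lock() depth */
	if (IS_ENABLED(CONFIG_PREEMPT_COUNT))
		return preempt_count() & PREEMPT_MASK;	/* readers disable preemption */
	return -1;					/* nesting cannot be determined */
}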
@@ -2105,6 +2260,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) err_segs[i++] = *rtrsp1; rt_read_nsegs = i; + rt_read_preempted = preempted; } return true; @@ -2417,7 +2573,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "shuffle_interval=%d stutter=%d irqreader=%d " "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " + "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " @@ -2425,12 +2581,13 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "read_exit_delay=%d read_exit_burst=%d " "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " - "test_nmis=%d\n", - torture_type, tag, nrealreaders, nfakewriters, + "test_nmis=%d " + "preempt_duration=%d preempt_interval=%d\n", + torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, + test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, @@ -2438,7 +2595,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) read_exit_delay, read_exit_burst, reader_flavor, nocbs_nthreads, nocbs_toggle, - test_nmis); + test_nmis, + preempt_duration, preempt_interval); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3068,12 +3226,12 @@ static int __init rcu_torture_fwd_prog_init(void) fwd_progress = 0; return 0; } - if (stall_cpu > 0) { - VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + if (stall_cpu > 0 || (preempt_duration > 0 && IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall and/or preemption testing"); fwd_progress = 0; if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ - WARN_ON(1); /* Make sure rcutorture notices conflict. */ + WARN_ON(1); /* Make sure rcutorture scripting notices conflict. */ return 0; } if (fwd_progress_holdoff <= 0) @@ -3418,6 +3576,35 @@ static void rcutorture_test_nmis(int n) #endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) } +// Randomly preempt online CPUs. +static int rcu_torture_preempt(void *unused) +{ + int cpu = -1; + DEFINE_TORTURE_RANDOM(rand); + + schedule_timeout_idle(stall_cpu_holdoff); + do { + // Wait for preempt_interval ms with up to 100us fuzz. + torture_hrtimeout_ms(preempt_interval, 100, &rand); + // Select online CPU. + cpu = cpumask_next(cpu, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_next(-1, cpu_online_mask); + WARN_ON_ONCE(cpu >= nr_cpu_ids); + // Move to that CPU, if can't do so, retry later. + if (torture_sched_setaffinity(current->pid, cpumask_of(cpu), false)) + continue; + // Preempt at high-ish priority, then reset to normal. 
+ sched_set_fifo(current); + torture_sched_setaffinity(current->pid, cpu_present_mask, true); + mdelay(preempt_duration); + sched_set_normal(current, 0); + stutter_wait("rcu_torture_preempt"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_preempt"); + return 0; +} + static enum cpuhp_state rcutor_hp; static void @@ -3427,6 +3614,7 @@ rcu_torture_cleanup(void) int flags = 0; unsigned long gp_seq = 0; int i; + int j; if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) { @@ -3446,6 +3634,7 @@ rcu_torture_cleanup(void) if (cur_ops->gp_kthread_dbg) cur_ops->gp_kthread_dbg(); + torture_stop_kthread(rcu_torture_preempt, preempt_task); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); rcu_torture_fwd_prog_cleanup(); @@ -3470,7 +3659,7 @@ rcu_torture_cleanup(void) rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) + for (i = 0; i < nrealfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); kfree(fakewriter_tasks); @@ -3508,26 +3697,74 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#x ", i, err_segs[i].rt_readstate); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP)) + pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL)); + else + pr_alert("\t"); + pr_cont("%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? "" : "+", err_segs[i].rt_delay_jiffies); firsttime = 0; } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + pr_cont(" CPU %2d", err_segs[i].rt_cpu); + if (err_segs[i].rt_cpu != err_segs[i].rt_end_cpu) + pr_cont("->%-2d", err_segs[i].rt_end_cpu); + else + pr_cont(" ..."); + } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && + cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) { + char buf1[20+1]; + char buf2[20+1]; + char sepchar = '-'; + + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq, + buf1, ARRAY_SIZE(buf1)); + cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end, + buf2, ARRAY_SIZE(buf2)); + if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) { + if (buf2[0]) { + for (j = 0; buf2[j]; j++) + buf2[j] = '.'; + if (j) + buf2[j - 1] = ' '; + } + sepchar = ' '; + } + pr_cont(" %s%c%s", buf1, sepchar, buf2); + } if (err_segs[i].rt_delay_ms != 0) { - pr_cont("%s%ldms", firsttime ? "" : "+", + pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); firsttime = 0; } if (err_segs[i].rt_delay_us != 0) { - pr_cont("%s%ldus", firsttime ? "" : "+", + pr_cont(" %s%ldus", firsttime ? "" : "+", err_segs[i].rt_delay_us); firsttime = 0; } - pr_cont("%s\n", - err_segs[i].rt_preempted ? "preempted" : ""); + pr_cont("%s", err_segs[i].rt_preempted ? 
" preempted" : ""); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_BH) + pr_cont(" BH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_IRQ) + pr_cont(" IRQ"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_PREEMPT) + pr_cont(" PREEMPT"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RBH) + pr_cont(" RBH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_SCHED) + pr_cont(" SCHED"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_1) + pr_cont(" RCU_1"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_2) + pr_cont(" RCU_2"); + pr_cont("\n"); } + if (rt_read_preempted) + pr_alert("\tReader was preempted.\n"); } if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); @@ -3844,6 +4081,14 @@ rcu_torture_init(void) rcu_torture_init_srcu_lockdep(); + if (nfakewriters >= 0) { + nrealfakewriters = nfakewriters; + } else { + nrealfakewriters = num_online_cpus() - 2 - nfakewriters; + if (nrealfakewriters <= 0) + nrealfakewriters = 1; + } + if (nreaders >= 0) { nrealreaders = nreaders; } else { @@ -3900,8 +4145,9 @@ rcu_torture_init(void) writer_task); if (torture_init_error(firsterr)) goto unwind; - if (nfakewriters > 0) { - fakewriter_tasks = kcalloc(nfakewriters, + + if (nrealfakewriters > 0) { + fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -3910,7 +4156,7 @@ rcu_torture_init(void) goto unwind; } } - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nrealfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); if (torture_init_error(firsterr)) @@ -4019,6 +4265,11 @@ rcu_torture_init(void) firsterr = rcu_torture_read_exit_init(); if (torture_init_error(firsterr)) goto unwind; + if (preempt_duration > 0) { + firsterr = torture_create_kthread(rcu_torture_preempt, NULL, preempt_task); + if (torture_init_error(firsterr)) + goto unwind; + } if (object_debug) rcu_test_debug_objects(); torture_init_end(); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index aacfcc9838b3..f11a7c2af778 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/torture.h> #include <linux/types.h> +#include <linux/sched/clock.h> #include "rcu.h" @@ -215,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = { .name = "srcu" }; +static void srcu_fast_ref_scale_read_section(const int nloops) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl) +{ + int i; + struct srcu_ctr __percpu *scp; + + for (i = nloops; i >= 0; i--) { + scp = srcu_read_lock_fast(srcu_ctlp); + un_delay(udl, ndl); + srcu_read_unlock_fast(srcu_ctlp, scp); + } +} + +static const struct ref_scale_ops srcu_fast_ops = { + .init = rcu_sync_scale_init, + .readsection = srcu_fast_ref_scale_read_section, + .delaysection = srcu_fast_ref_scale_delay_section, + .name = "srcu-fast" +}; + static void srcu_lite_ref_scale_read_section(const int nloops) { int i; @@ -531,6 +562,39 @@ static const struct ref_scale_ops acqrel_ops = { static volatile u64 stopopts; +static void ref_sched_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += sched_clock(); + preempt_enable(); + stopopts = x; +} + +static void 
ref_sched_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += sched_clock(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static const struct ref_scale_ops sched_clock_ops = { + .readsection = ref_sched_clock_section, + .delaysection = ref_sched_clock_delay_section, + .name = "sched-clock" +}; + + static void ref_clock_section(const int nloops) { u64 x = 0; @@ -1129,10 +1193,10 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS - &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, - &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, - &typesafe_seqlock_ops, + &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, + &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, + &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 4dcbf8aa80ff..6e9fe2ce1075 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -20,7 +20,11 @@ #include "rcu_segcblist.h" #include "rcu.h" +#ifndef CONFIG_TREE_RCU int rcu_scheduler_active __read_mostly; +#else // #ifndef CONFIG_TREE_RCU +extern int rcu_scheduler_active; +#endif // #else // #ifndef CONFIG_TREE_RCU static LIST_HEAD(srcu_boot_list); static bool srcu_init_done; @@ -98,7 +102,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { int newval; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); preempt_enable(); @@ -120,7 +124,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { preempt_enable(); return; /* Already running or nothing to do. */ @@ -138,7 +142,7 @@ void srcu_drive_gp(struct work_struct *wp) WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ preempt_enable(); swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); preempt_enable(); @@ -159,7 +163,7 @@ void srcu_drive_gp(struct work_struct *wp) * at interrupt level, but the ->srcu_gp_running checks will * straighten that out. 
*/ - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY WRITE_ONCE(ssp->srcu_gp_running, false); idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); preempt_enable(); @@ -172,7 +176,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { unsigned long cookie; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY cookie = get_state_synchronize_srcu(ssp); if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { preempt_enable(); @@ -199,7 +203,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rhp->func = func; rhp->next = NULL; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY local_irq_save(flags); *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; @@ -261,7 +265,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) { unsigned long ret; - preempt_disable(); // Needed for PREEMPT_AUTO + preempt_disable(); // Needed for PREEMPT_LAZY ret = get_state_synchronize_srcu(ssp); srcu_gp_start_if_needed(ssp); preempt_enable(); @@ -282,11 +286,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) } EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); +#ifndef CONFIG_TREE_RCU /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { rcu_scheduler_active = RCU_SCHEDULER_RUNNING; } +#endif // #ifndef CONFIG_TREE_RCU /* * Queue work for srcu_struct structures with early boot callbacks. diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5e2e53464794..d2a694944553 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -116,8 +116,9 @@ do { \ /* * Initialize SRCU per-CPU data. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and - * srcu_read_unlock() running against them. So if the is_static parameter - * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + * srcu_read_unlock() running against them. So if the is_static + * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and + * ->srcu_ctrs[].srcu_unlocks. */ static void init_srcu_struct_data(struct srcu_struct *ssp) { @@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) * Initialize the per-CPU srcu_data array, which feeds into the * leaves of the srcu_node tree. 
*/ - BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) != - ARRAY_SIZE(sdp->srcu_unlock_count)); for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(ssp->sda, cpu); spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); @@ -247,15 +246,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->node = NULL; mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); - ssp->srcu_idx = 0; ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); ssp->srcu_sup->sda_is_static = is_static; - if (!is_static) + if (!is_static) { ssp->sda = alloc_percpu(struct srcu_data); + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; + } if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); @@ -429,10 +429,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp) } /* - * Computes approximate total of the readers' ->srcu_lock_count[] values - * for the rank of per-CPU counters specified by idx, and returns true if - * the caller did the proper barrier (gp), and if the count of the locks - * matches that of the unlocks passed in. + * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks + * values for the rank of per-CPU counters specified by idx, and returns + * true if the caller did the proper barrier (gp), and if the count of + * the locks matches that of the unlocks passed in. */ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks) { @@ -443,20 +443,20 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks); if (IS_ENABLED(CONFIG_PROVE_RCU)) mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), "Mixed reader flavors for srcu_struct at %ps.\n", ssp); - if (mask & SRCU_READ_FLAVOR_LITE && !gp) + if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp) return false; return sum == unlocks; } /* - * Returns approximate total of the readers' ->srcu_unlock_count[] values - * for the rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks + * values for the rank of per-CPU counters specified by idx. 
*/ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm) { @@ -467,7 +467,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, u for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_unlock_count[idx]); + sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks); mask = mask | READ_ONCE(sdp->srcu_reader_flavor); } WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)), @@ -487,7 +487,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) unsigned long unlocks; unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm); - did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE); + did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP); /* * Make sure that a lock is always counted if the corresponding @@ -509,48 +509,49 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) * If the locks are the same as the unlocks, then there must have * been no readers on this index at some point in this function. * But there might be more readers, as a task might have read - * the current ->srcu_idx but not yet have incremented its CPU's - * ->srcu_lock_count[idx] counter. In fact, it is possible + * the current ->srcu_ctrp but not yet have incremented its CPU's + * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible * that most of the tasks have been preempted between fetching - * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there - * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks - * in a system whose address space was fully populated with memory. - * Call this quantity Nt. + * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And + * there could be almost (ULONG_MAX / sizeof(struct task_struct)) + * tasks in a system whose address space was fully populated + * with memory. Call this quantity Nt. * - * So suppose that the updater is preempted at this point in the - * code for a long time. That now-preempted updater has already - * flipped ->srcu_idx (possibly during the preceding grace period), - * done an smp_mb() (again, possibly during the preceding grace - * period), and summed up the ->srcu_unlock_count[idx] counters. - * How many times can a given one of the aforementioned Nt tasks - * increment the old ->srcu_idx value's ->srcu_lock_count[idx] - * counter, in the absence of nesting? + * So suppose that the updater is preempted at this + * point in the code for a long time. That now-preempted + * updater has already flipped ->srcu_ctrp (possibly during + * the preceding grace period), done an smp_mb() (again, + * possibly during the preceding grace period), and summed up + * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times + * can a given one of the aforementioned Nt tasks increment the + * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter, + * in the absence of nesting? * * It can clearly do so once, given that it has already fetched - * the old value of ->srcu_idx and is just about to use that value - * to index its increment of ->srcu_lock_count[idx]. But as soon as - * it leaves that SRCU read-side critical section, it will increment - * ->srcu_unlock_count[idx], which must follow the updater's above - * read from that same value. Thus, as soon the reading task does - * an smp_mb() and a later fetch from ->srcu_idx, that task will be - * guaranteed to get the new index. 
Except that the increment of - * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the - * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() - * is before the smp_mb(). Thus, that task might not see the new - * value of ->srcu_idx until the -second- __srcu_read_lock(), - * which in turn means that this task might well increment - * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, - * not just once. + * the old value of ->srcu_ctrp and is just about to use that + * value to index its increment of ->srcu_ctrs[idx].srcu_locks. + * But as soon as it leaves that SRCU read-side critical section, + * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must + * follow the updater's above read from that same value. Thus, + as soon the reading task does an smp_mb() and a later fetch from + * ->srcu_ctrp, that task will be guaranteed to get the new index. + * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks + * in __srcu_read_unlock() is after the smp_mb(), and the fetch + * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb(). + * Thus, that task might not see the new value of ->srcu_ctrp until + * the -second- __srcu_read_lock(), which in turn means that this + * task might well increment ->srcu_ctrs[idx].srcu_locks for the + * old value of ->srcu_ctrp twice, not just once. * * However, it is important to note that a given smp_mb() takes * effect not just for the task executing it, but also for any * later task running on that same CPU. * - * That is, there can be almost Nt + Nc further increments of - * ->srcu_lock_count[idx] for the old index, where Nc is the number - * of CPUs. But this is OK because the size of the task_struct - * structure limits the value of Nt and current systems limit Nc - * to a few thousand. + * That is, there can be almost Nt + Nc further increments + * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc + * is the number of CPUs. But this is OK because the size of + * the task_struct structure limits the value of Nt and current + * systems limit Nc to a few thousand. * * OK, but what about nesting? This does impose a limit on * nesting of half of the size of the task_struct structure @@ -581,10 +582,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp) for_each_possible_cpu(cpu) { struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); - sum += atomic_long_read(&sdp->srcu_lock_count[0]); - sum += atomic_long_read(&sdp->srcu_lock_count[1]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[0]); - sum -= atomic_long_read(&sdp->srcu_unlock_count[1]); + sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks); + sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks); + sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks); + sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks); } return sum; } @@ -647,6 +648,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) unsigned long jbase = SRCU_INTERVAL; struct srcu_usage *sup = ssp->srcu_sup; + lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); if (srcu_gp_is_expedited(ssp)) jbase = 0; if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) { @@ -674,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; + unsigned long delay; struct srcu_usage *sup = ssp->srcu_sup; - if (WARN_ON(!srcu_get_delay(ssp))) + spin_lock_irq_rcu_node(ssp->srcu_sup); + delay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); + if (WARN_ON(!delay)) return; /* Just leak it! 
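The hunks just above turn srcu_get_delay() into a caller-locked helper: it now does lockdep_assert_held() on the srcu_usage lock, and cleanup_srcu_struct() (and, later in this diff, try_check_zero() and process_srcu()) wraps the call in spin_lock_irq_rcu_node()/spin_unlock_irq_rcu_node(). A small user-space sketch of the same discipline with POSIX threads; the owner-tracking fields are an illustrative stand-in for lockdep, not anything the kernel code does.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t owner;                   /* debug-only stand-in for lockdep */
static int owner_valid;

static void lock_take(void)
{
        pthread_mutex_lock(&lock);
        owner = pthread_self();
        owner_valid = 1;
}

static void lock_drop(void)
{
        owner_valid = 0;
        pthread_mutex_unlock(&lock);
}

/* cf. lockdep_assert_held(): complain if the caller forgot the lock. */
static void assert_lock_held(void)
{
        assert(owner_valid && pthread_equal(owner, pthread_self()));
}

/* Caller-locked helper, cf. srcu_get_delay() after this change. */
static unsigned long get_delay(void)
{
        assert_lock_held();
        return 5;                         /* placeholder computation */
}

int main(void)
{
        unsigned long delay;

        lock_take();                      /* cf. spin_lock_irq_rcu_node()   */
        delay = get_delay();
        lock_drop();                      /* cf. spin_unlock_irq_rcu_node() */
        printf("delay=%lu\n", delay);
        return 0;
}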
*/ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ @@ -738,16 +744,16 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. - * Returns an index that must be passed to the matching srcu_read_unlock(). + * Returns a guaranteed non-negative index that must be passed to the + * matching __srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *ssp) { - int idx; + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); + this_cpu_inc(scp->srcu_locks.counter); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scp); } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -759,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); + this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -772,13 +778,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { - int idx; - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); + struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp); + struct srcu_ctr *scp = raw_cpu_ptr(scpp); - idx = READ_ONCE(ssp->srcu_idx) & 0x1; - atomic_long_inc(&sdp->srcu_lock_count[idx]); + atomic_long_inc(&scp->srcu_locks); smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ - return idx; + return __srcu_ptr_to_ctr(ssp, scpp); } EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); @@ -789,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); */ void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ - atomic_long_inc(&sdp->srcu_unlock_count[idx]); + atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks); } EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); @@ -1076,7 +1079,6 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* If grace period not already in progress, start it. */ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed)); srcu_gp_start(ssp); // And how can that list_add() in the "else" clause @@ -1096,13 +1098,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->srcu_idx is not changed while checking. + * The caller must ensure that ->srcu_ctrp is not changed while checking. 
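Above, __srcu_read_lock() counts the new reader with this_cpu_inc(), while __srcu_read_lock_nmisafe() uses atomic_long_inc() on the same counter, presumably because an increment that can fire from NMI context must be a single atomic read-modify-write on every architecture. A loose user-space analogue, with a signal handler standing in for the NMI and a C11 atomic standing in for atomic_long_t; purely illustrative.

#include <assert.h>
#include <signal.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

/* Shared with the signal handler, so updated only via atomic RMWs, much as */
/* the _nmisafe readers use atomic_long_inc() rather than a plain per-CPU   */
/* increment.                                                               */
static atomic_long ctr;

static void handler(int sig)
{
        (void)sig;
        /* A lock-free atomic add is async-signal-safe. */
        atomic_fetch_add_explicit(&ctr, 1, memory_order_relaxed);
}

int main(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);

        for (int i = 0; i < 1000; i++) {
                raise(SIGUSR1);           /* the "NMI" interrupts us here */
                atomic_fetch_add_explicit(&ctr, 1, memory_order_relaxed);
        }

        /* 1000 increments from the loop plus 1000 from the handler. */
        printf("count=%ld\n", atomic_load(&ctr));
        assert(atomic_load(&ctr) == 2000);
        return 0;
}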
*/ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) { unsigned long curdelay; + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = !srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); for (;;) { if (srcu_readers_active_idx_check(ssp, idx)) @@ -1114,30 +1118,30 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) } /* - * Increment the ->srcu_idx counter so that future SRCU readers will + * Increment the ->srcu_ctrp counter so that future SRCU readers will * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *ssp) { /* - * Because the flip of ->srcu_idx is executed only if the + * Because the flip of ->srcu_ctrp is executed only if the * preceding call to srcu_readers_active_idx_check() found that - * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched - * and because that summing uses atomic_long_read(), there is - * ordering due to a control dependency between that summing and - * the WRITE_ONCE() in this call to srcu_flip(). This ordering - * ensures that if this updater saw a given reader's increment from - * __srcu_read_lock(), that reader was using a value of ->srcu_idx - * from before the previous call to srcu_flip(), which should be - * quite rare. This ordering thus helps forward progress because - * the grace period could otherwise be delayed by additional - * calls to __srcu_read_lock() using that old (soon to be new) - * value of ->srcu_idx. + * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums + * matched and because that summing uses atomic_long_read(), + * there is ordering due to a control dependency between that + * summing and the WRITE_ONCE() in this call to srcu_flip(). + * This ordering ensures that if this updater saw a given reader's + * increment from __srcu_read_lock(), that reader was using a value + * of ->srcu_ctrp from before the previous call to srcu_flip(), + * which should be quite rare. This ordering thus helps forward + * progress because the grace period could otherwise be delayed + * by additional calls to __srcu_read_lock() using that old (soon + * to be new) value of ->srcu_ctrp. * * This sum-equality check and ordering also ensures that if * a given call to __srcu_read_lock() uses the new value of - * ->srcu_idx, this updater's earlier scans cannot have seen + * ->srcu_ctrp, this updater's earlier scans cannot have seen * that reader's increments, which is all to the good, because * this grace period need not wait on that reader. After all, * if those earlier scans had seen that reader, there would have @@ -1152,7 +1156,8 @@ static void srcu_flip(struct srcu_struct *ssp) */ smp_mb(); /* E */ /* Pairs with B and C. */ - WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter. + WRITE_ONCE(ssp->srcu_ctrp, + &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -1198,7 +1203,7 @@ static bool srcu_should_expedite(struct srcu_struct *ssp) check_init_srcu_struct(ssp); /* If _lite() readers, don't do unsolicited expediting. */ - if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE) + if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP) return false; /* If the local srcu_data structure has callbacks, not idle. 
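srcu_flip() above no longer toggles an integer ->srcu_idx; it swings ->srcu_ctrp between the two elements of ->srcu_ctrs[], and index values are recovered by pointer subtraction. The &ctrs[!(p - &ctrs[0])] idiom can be checked in isolation; the __srcu_ptr_to_ctr()/__srcu_ctr_to_ptr() helpers used by the read-side hunks presumably perform the equivalent conversions, though their bodies are not shown in this diff.

#include <assert.h>
#include <stdio.h>

struct ctr {                              /* cf. struct srcu_ctr */
        long locks;
        long unlocks;
};

int main(void)
{
        struct ctr ctrs[2] = { { 0, 0 }, { 0, 0 } };
        struct ctr *ctrp = &ctrs[0];      /* cf. ssp->srcu_ctrp */

        /* Pointer to index, cf. what __srcu_ptr_to_ctr() would return. */
        int idx = (int)(ctrp - &ctrs[0]);
        assert(idx == 0);

        /* Index back to pointer, cf. __srcu_ctr_to_ptr(). */
        assert(&ctrs[idx] == ctrp);

        /* The flip, exactly as in srcu_flip(): pick "the other" element. */
        ctrp = &ctrs[!(ctrp - &ctrs[0])];
        assert(ctrp == &ctrs[1]);

        /* cf. idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) in      */
        /* srcu_advance_state(): the index ctrp is NOT pointing at.      */
        idx = !(ctrp - &ctrs[0]);
        assert(idx == 0);

        /* Flip once more and we are back where we started. */
        ctrp = &ctrs[!(ctrp - &ctrs[0])];
        assert(ctrp == &ctrs[0]);

        printf("pointer-based flip behaves like the old 0/1 index toggle\n");
        return 0;
}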
*/ sdp = raw_cpu_ptr(ssp->sda); @@ -1398,8 +1403,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, * read-side critical sections are delimited by srcu_read_lock() and * srcu_read_unlock(), and may be nested. * - * The callback will be invoked from process context, but must nevertheless - * be fast and must not block. + * The callback will be invoked from process context, but with bh + * disabled. The callback function must therefore be fast and must + * not block. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. */ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func) @@ -1465,8 +1474,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, - * and then flip the srcu_idx and wait for the count of the other index. + * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero + * at first, and then flip the ->srcu_ctrp and wait for the count of the + * other index. * * Can block; must be called from process context. * @@ -1675,7 +1685,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *ssp) { - return READ_ONCE(ssp->srcu_idx); + return READ_ONCE(ssp->srcu_sup->srcu_gp_seq); } EXPORT_SYMBOL_GPL(srcu_batches_completed); @@ -1692,7 +1702,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) /* * Because readers might be delayed for an extended period after - * fetching ->srcu_idx for their index, at any point in time there + * fetching ->srcu_ctrp for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -1720,7 +1730,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) } if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 1)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. */ @@ -1738,7 +1748,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (ssp->srcu_idx & 1); + idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]); if (!try_check_zero(ssp, idx, 2)) { mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); return; /* readers present, retry later. 
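For context on the two try_check_zero() calls above: the helper is a bounded poll, checking whether the chosen rank of counters has drained and retrying a small number of times (per its header comment, one extra pass when an expedited grace period is pending, and two quick passes right after a flip) before reporting "readers present, retry later". A generic user-space sketch of that shape; the predicate, delay, and state below are placeholders, not the kernel's.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static int remaining = 3;                 /* placeholder "readers left" state */

/* Placeholder predicate, cf. srcu_readers_active_idx_check(). */
static bool readers_drained(void)
{
        if (remaining > 0)
                remaining--;
        return remaining == 0;
}

/*
 * Poll the predicate up to @trycount times, cf. try_check_zero().
 * Returning false means "readers present, retry later"; the caller
 * reschedules itself instead of spinning indefinitely.
 */
static bool try_check_zero_like(int trycount)
{
        for (;;) {
                if (readers_drained())
                        return true;
                if (--trycount <= 0)
                        return false;
                usleep(10);               /* brief pause, purely illustrative */
        }
}

int main(void)
{
        printf("one try:  %s\n", try_check_zero_like(1) ? "drained" : "retry later");
        printf("two more: %s\n", try_check_zero_like(2) ? "drained" : "retry later");
        return 0;
}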
*/ @@ -1849,7 +1859,9 @@ static void process_srcu(struct work_struct *work) ssp = sup->srcu_ssp; srcu_advance_state(ssp); + spin_lock_irq_rcu_node(ssp->srcu_sup); curdelay = srcu_get_delay(ssp); + spin_unlock_irq_rcu_node(ssp->srcu_sup); if (curdelay) { WRITE_ONCE(sup->reschedule_count, 0); } else { @@ -1896,7 +1908,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); int ss_state_idx = ss_state; - idx = ssp->srcu_idx & 0x1; + idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]; if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; pr_alert("%s%s Tree SRCU g%ld state %d (%s)", @@ -1914,8 +1926,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); - u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); + u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks)); + u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks)); /* * Make sure that a lock is always counted if the corresponding @@ -1923,8 +1935,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); - l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); + l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks)); + l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks)); c0 = l0 - u0; c1 = l1 - u1; @@ -2001,6 +2013,7 @@ static int srcu_module_coming(struct module *mod) ssp->sda = alloc_percpu(struct srcu_data); if (WARN_ON_ONCE(!ssp->sda)) return -ENOMEM; + ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0]; } return 0; } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 59314da5eb60..466668eb4fad 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -2256,7 +2256,7 @@ void __init tasks_cblist_init_generic(void) #endif } -void __init rcu_init_tasks_generic(void) +static int __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU rcu_spawn_tasks_kthread(); @@ -2272,7 +2272,10 @@ void __init rcu_init_tasks_generic(void) // Run the self-tests. rcu_tasks_initiate_self_tests(); + + return 0; } +core_initcall(rcu_init_tasks_generic); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index b3b3ce34df63..c1ebfd51768b 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user) static inline bool rcu_reclaim_tiny(struct rcu_head *head) { rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kvfree_rcu_offset(offset)) { - trace_rcu_invoke_kvfree_callback("", head, offset); - kvfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } trace_rcu_invoke_callback("", head); f = head->func; @@ -159,10 +152,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static void tiny_rcu_leak_callback(struct rcu_head *rhp) -{ -} - /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) pr_err("%s(): Double-freed CB %p->%pS()!!! 
", __func__, head, head->func); mem_dump_obj(head); } - - if (!__is_kvfree_rcu_offset((unsigned long)head->func)) - WRITE_ONCE(head->func, tiny_rcu_leak_callback); return; } @@ -246,15 +232,18 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) +unsigned long long rcutorture_gather_gp_seqs(void) { - if (head) - kasan_record_aux_stack_noalloc(ptr); + return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL; +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); - __kvfree_call_rcu(head, ptr); +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + snprintf(cp, len, "g%04llx", seqs & 0xffffULL); } -EXPORT_SYMBOL_GPL(kvfree_call_rcu); +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); #endif void __init rcu_init(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ff98233d4aa5..659f83e71048 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -149,7 +149,6 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static struct task_struct *rcu_boost_task(struct rcu_node *rnp); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -186,26 +185,6 @@ static int rcu_unlock_delay; module_param(rcu_unlock_delay, int, 0444); #endif -/* - * This rcu parameter is runtime-read-only. It reflects - * a minimum allowed number of objects which can be cached - * per-CPU. Object size is equal to one page. This value - * can be changed at boot time. - */ -static int rcu_min_cached_objs = 5; -module_param(rcu_min_cached_objs, int, 0444); - -// A page shrinker can ask for pages to be freed to make them -// available for other parts of the system. This usually happens -// under low memory conditions, and in that case we should also -// defer page-cache filling for a short time period. -// -// The default value is 5 seconds, which is long enough to reduce -// interference with the shrinker while it asks other systems to -// drain their caches. -static int rcu_delay_page_cache_fill_msec = 5000; -module_param(rcu_delay_page_cache_fill_msec, int, 0444); - /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -559,6 +538,26 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); +/* Gather grace-period sequence numbers for rcutorture diagnostics. */ +unsigned long long rcutorture_gather_gp_seqs(void) +{ + return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) | + ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) | + (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL); +} +EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs); + +/* Format grace-period sequence numbers for rcutorture diagnostics. 
*/ +void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) +{ + unsigned int egp = (seqs >> 16) & 0xffffffULL; + unsigned int ggp = (seqs >> 40) & 0xffffULL; + unsigned int pgp = seqs & 0xffffULL; + + snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp); +} +EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); + #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on @@ -1275,7 +1274,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Handle the ends of any preceding grace periods first. */ if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { if (!offloaded) ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ rdp->core_needs_qs = false; @@ -1289,7 +1288,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) /* Now handle the beginnings of any new-to-this-CPU grace periods. */ if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || - unlikely(READ_ONCE(rdp->gpwrap))) { + unlikely(rdp->gpwrap)) { /* * If the current grace period is waiting for this CPU, * set up to detect a quiescent state, otherwise don't @@ -1304,7 +1303,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); - if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap)) + if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap) WRITE_ONCE(rdp->last_sched_clock, jiffies); WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1633,12 +1632,10 @@ static void rcu_sr_normal_complete(struct llist_node *node) { struct rcu_synchronize *rs = container_of( (struct rcu_head *) node, struct rcu_synchronize, head); - unsigned long oldstate = (unsigned long) rs->head.func; WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && - !poll_state_synchronize_rcu(oldstate), - "A full grace period is not passed yet: %lu", - rcu_seq_diff(get_state_synchronize_rcu(), oldstate)); + !poll_state_synchronize_rcu_full(&rs->oldstate), + "A full grace period is not passed yet!\n"); /* Finally. */ complete(&rs->completion); @@ -1822,10 +1819,14 @@ static noinline_for_stack bool rcu_gp_init(void) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(); + /* + * A new wait segment must be started before gp_seq advanced, so + * that previous gp waiters won't observe the new gp_seq. + */ + start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). 
*/ rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); @@ -2952,13 +2953,8 @@ static int __init rcu_spawn_core_kthreads(void) static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) { rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); } @@ -3083,9 +3079,12 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } head->func = func; head->next = NULL; - kasan_record_aux_stack_noalloc(head); + kasan_record_aux_stack(head); + local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); + RCU_LOCKDEP_WARN(!rcu_rdp_cpu_online(rdp), "Callback enqueued on offline CPU!"); + lazy = lazy_in && !rcu_async_should_hurry(); /* Add the callback to our list. */ @@ -3125,7 +3124,7 @@ module_param(enable_rcu_lazy, bool, 0444); * critical sections have completed. * * Use this API instead of call_rcu() if you don't want the callback to be - * invoked after very long periods of time, which can happen on systems without + * delayed for very long periods of time, which can happen on systems without * memory pressure and on systems which are lightly loaded or mostly idle. * This function will cause callbacks to be invoked sooner than later at the * expense of extra power. Other than that, this function is identical to, and @@ -3156,6 +3155,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * might well execute concurrently with RCU read-side critical sections * that started after call_rcu() was invoked. * + * It is perfectly legal to repost an RCU callback, potentially with + * a different callback function, from within its callback function. + * The specified function will be invoked after another full grace period + * has elapsed. This use case is similar in form to the common practice + * of reposting a timer from within its own handler. + * * RCU read-side critical sections are delimited by rcu_read_lock() * and rcu_read_unlock(), and may be nested. In addition, but only in * v5.0 and later, regions of code across which interrupts, preemption, @@ -3184,6 +3189,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); * * Implementation of these memory-ordering guarantees is described here: * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. + * + * Specific to call_rcu() (as opposed to the other call_rcu*() functions), + * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many + * seconds before starting the grace period needed by the corresponding + * callback. This delay can significantly improve energy-efficiency + * on low-utilization battery-powered devices. To avoid this delay, + * in latency-sensitive kernel code, use call_rcu_hurry(). */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { @@ -3191,812 +3203,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); -/* Maximum number of jiffies to wait before draining a batch. 
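A few hunks up, the call_rcu() kerneldoc gains a note that a callback may legally repost itself, possibly with a different function, much like a timer rearming from its own handler. A kernel-style sketch of that pattern; the structure, field names, and policy are hypothetical, and only call_rcu(), container_of(), and kfree() are real interfaces (this is a fragment for a module, not a stand-alone program).

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {                           /* hypothetical example structure */
        struct rcu_head rh;
        int remaining;                    /* further grace periods to ride  */
};

static void my_cb(struct rcu_head *rhp)
{
        struct my_obj *obj = container_of(rhp, struct my_obj, rh);

        if (obj->remaining-- > 0) {
                /* Repost from within the callback, as the comment allows: */
                /* my_cb() runs again after another full grace period.     */
                call_rcu(&obj->rh, my_cb);
                return;
        }
        kfree(obj);                       /* done: free after the last GP */
}

/* Kick the chain off: obj is freed only after n+1 grace periods. */
static void my_start(struct my_obj *obj, int n)
{
        obj->remaining = n;
        call_rcu(&obj->rh, my_cb);
}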
*/ -#define KFREE_DRAIN_JIFFIES (5 * HZ) -#define KFREE_N_BATCHES 2 -#define FREE_N_CHANNELS 2 - -/** - * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers - * @list: List node. All blocks are linked between each other - * @gp_snap: Snapshot of RCU state for objects placed to this bulk - * @nr_records: Number of active pointers in the array - * @records: Array of the kvfree_rcu() pointers - */ -struct kvfree_rcu_bulk_data { - struct list_head list; - struct rcu_gp_oldstate gp_snap; - unsigned long nr_records; - void *records[] __counted_by(nr_records); -}; - -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kvfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KVFREE_BULK_MAX_ENTR \ - ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) - -/** - * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period - * @head_free: List of kfree_rcu() objects waiting for a grace period - * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. - * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period - * @krcp: Pointer to @kfree_rcu_cpu structure - */ - -struct kfree_rcu_cpu_work { - struct rcu_work rcu_work; - struct rcu_head *head_free; - struct rcu_gp_oldstate head_free_gp_snap; - struct list_head bulk_head_free[FREE_N_CHANNELS]; - struct kfree_rcu_cpu *krcp; -}; - -/** - * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period - * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" - * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period - * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period - * @lock: Synchronize access to this structure - * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES - * @initialized: The @rcu_work fields have been initialized - * @head_count: Number of objects in rcu_head singular list - * @bulk_count: Number of objects in bulk-list - * @bkvcache: - * A simple cache list that contains objects for reuse purpose. - * In order to save some per-cpu space the list is singular. - * Even though it is lockless an access has to be protected by the - * per-cpu lock. - * @page_cache_work: A work to refill the cache when it is empty - * @backoff_page_cache_fill: Delay cache refills - * @work_in_progress: Indicates that page_cache_work is running - * @hrtimer: A hrtimer for scheduling a page_cache_work - * @nr_bkv_objs: number of allocated objects at @bkvcache. - * - * This is a per-CPU structure. The reason that it is not included in - * the rcu_data structure is to permit this code to be extracted from - * the RCU files. Such extraction could allow further optimization of - * the interactions with the slab allocators. - */ -struct kfree_rcu_cpu { - // Objects queued on a linked list - // through their rcu_head structures. - struct rcu_head *head; - unsigned long head_gp_snap; - atomic_t head_count; - - // Objects queued on a bulk-list. 
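For reference, the bulk blocks being removed here were sized so that the header plus the pointer array filled exactly one page, which is what the KVFREE_BULK_MAX_ENTR definition above computes. The arithmetic can be checked in isolation; 4096 is an assumed page size and the struct below is a simplified stand-in for kvfree_rcu_bulk_data.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL                  /* assumed page size for the demo */

struct bulk_data {                        /* simplified kvfree_rcu_bulk_data */
        struct bulk_data *next;           /* stand-in for the list_head      */
        unsigned long gp_snap;            /* stand-in for the GP snapshot    */
        unsigned long nr_records;
        void *records[];                  /* flexible array of queued ptrs   */
};

/* Same formula as the removed KVFREE_BULK_MAX_ENTR macro. */
#define BULK_MAX_ENTR \
        ((PAGE_SIZE - sizeof(struct bulk_data)) / sizeof(void *))

int main(void)
{
        struct bulk_data *b = malloc(PAGE_SIZE);  /* one block == one page */

        if (!b)
                return 1;
        b->nr_records = BULK_MAX_ENTR;

        printf("%zu-byte header + %lu pointers * %zu bytes = %zu of %lu\n",
               sizeof(struct bulk_data), b->nr_records, sizeof(void *),
               (size_t)(sizeof(struct bulk_data) + b->nr_records * sizeof(void *)),
               PAGE_SIZE);
        free(b);
        return 0;
}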
- struct list_head bulk_head[FREE_N_CHANNELS]; - atomic_t bulk_count[FREE_N_CHANNELS]; - - struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - raw_spinlock_t lock; - struct delayed_work monitor_work; - bool initialized; - - struct delayed_work page_cache_work; - atomic_t backoff_page_cache_fill; - atomic_t work_in_progress; - struct hrtimer hrtimer; - - struct llist_head bkvcache; - int nr_bkv_objs; -}; - -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), -}; - -static __always_inline void -debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) -{ -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - int i; - - for (i = 0; i < bhead->nr_records; i++) - debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); -#endif -} - -static inline struct kfree_rcu_cpu * -krc_this_cpu_lock(unsigned long *flags) -{ - struct kfree_rcu_cpu *krcp; - - local_irq_save(*flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - raw_spin_lock(&krcp->lock); - - return krcp; -} - -static inline void -krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) -{ - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -static inline struct kvfree_rcu_bulk_data * -get_cached_bnode(struct kfree_rcu_cpu *krcp) -{ - if (!krcp->nr_bkv_objs) - return NULL; - - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); - return (struct kvfree_rcu_bulk_data *) - llist_del_first(&krcp->bkvcache); -} - -static inline bool -put_cached_bnode(struct kfree_rcu_cpu *krcp, - struct kvfree_rcu_bulk_data *bnode) -{ - // Check the limit. - if (krcp->nr_bkv_objs >= rcu_min_cached_objs) - return false; - - llist_add((struct llist_node *) bnode, &krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); - return true; -} - -static int -drain_page_cache(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - struct llist_node *page_list, *pos, *n; - int freed = 0; - - if (!rcu_min_cached_objs) - return 0; - - raw_spin_lock_irqsave(&krcp->lock, flags); - page_list = llist_del_all(&krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, 0); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - llist_for_each_safe(pos, n, page_list) { - free_page((unsigned long)pos); - freed++; - } - - return freed; -} - -static void -kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, - struct kvfree_rcu_bulk_data *bnode, int idx) -{ - unsigned long flags; - int i; - - if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { - debug_rcu_bhead_unqueue(bnode); - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). 
- for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); - - vfree(bnode->records[i]); - } - } - rcu_lock_release(&rcu_callback_map); - } - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (put_cached_bnode(krcp, bnode)) - bnode = NULL; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (bnode) - free_page((unsigned long) bnode); - - cond_resched_tasks_rcu_qs(); -} - -static void -kvfree_rcu_list(struct rcu_head *head) -{ - struct rcu_head *next; - - for (; head; head = next) { - void *ptr = (void *) head->func; - unsigned long offset = (void *) head - ptr; - - next = head->next; - debug_rcu_head_unqueue((struct rcu_head *)ptr); - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); - - rcu_lock_release(&rcu_callback_map); - cond_resched_tasks_rcu_qs(); - } -} - -/* - * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bulk_head_free or ->head_free. - */ -static void kfree_rcu_work(struct work_struct *work) -{ - unsigned long flags; - struct kvfree_rcu_bulk_data *bnode, *n; - struct list_head bulk_head[FREE_N_CHANNELS]; - struct rcu_head *head; - struct kfree_rcu_cpu *krcp; - struct kfree_rcu_cpu_work *krwp; - struct rcu_gp_oldstate head_gp_snap; - int i; - - krwp = container_of(to_rcu_work(work), - struct kfree_rcu_cpu_work, rcu_work); - krcp = krwp->krcp; - - raw_spin_lock_irqsave(&krcp->lock, flags); - // Channels 1 and 2. - for (i = 0; i < FREE_N_CHANNELS; i++) - list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]); - - // Channel 3. - head = krwp->head_free; - krwp->head_free = NULL; - head_gp_snap = krwp->head_free_gp_snap; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - // Handle the first two channels. - for (i = 0; i < FREE_N_CHANNELS; i++) { - // Start from the tail page, so a GP is likely passed for it. - list_for_each_entry_safe(bnode, n, &bulk_head[i], list) - kvfree_rcu_bulk(krcp, bnode, i); - } - - /* - * This is used when the "bulk" path can not be used for the - * double-argument of kvfree_rcu(). This happens when the - * page-cache is empty, which means that objects are instead - * queued on a linked list through their rcu_head structures. - * This list is named "Channel 3". - */ - if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) - kvfree_rcu_list(head); -} - -static bool -need_offload_krc(struct kfree_rcu_cpu *krcp) -{ - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - if (!list_empty(&krcp->bulk_head[i])) - return true; - - return !!READ_ONCE(krcp->head); -} - -static bool -need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) -{ - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - if (!list_empty(&krwp->bulk_head_free[i])) - return true; - - return !!krwp->head_free; -} - -static int krc_count(struct kfree_rcu_cpu *krcp) -{ - int sum = atomic_read(&krcp->head_count); - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - sum += atomic_read(&krcp->bulk_count[i]); - - return sum; -} - -static void -__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) -{ - long delay, delay_left; - - delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 
1:KFREE_DRAIN_JIFFIES; - if (delayed_work_pending(&krcp->monitor_work)) { - delay_left = krcp->monitor_work.timer.expires - jiffies; - if (delay < delay_left) - mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); - return; - } - queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); -} - -static void -schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&krcp->lock, flags); - __schedule_delayed_monitor_work(krcp); - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -static void -kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) -{ - struct list_head bulk_ready[FREE_N_CHANNELS]; - struct kvfree_rcu_bulk_data *bnode, *n; - struct rcu_head *head_ready = NULL; - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&krcp->lock, flags); - for (i = 0; i < FREE_N_CHANNELS; i++) { - INIT_LIST_HEAD(&bulk_ready[i]); - - list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { - if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) - break; - - atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); - list_move(&bnode->list, &bulk_ready[i]); - } - } - - if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { - head_ready = krcp->head; - atomic_set(&krcp->head_count, 0); - WRITE_ONCE(krcp->head, NULL); - } - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - for (i = 0; i < FREE_N_CHANNELS; i++) { - list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) - kvfree_rcu_bulk(krcp, bnode, i); - } - - if (head_ready) - kvfree_rcu_list(head_ready); -} - -/* - * Return: %true if a work is queued, %false otherwise. - */ -static bool -kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - bool queued = false; - int i, j; - - raw_spin_lock_irqsave(&krcp->lock, flags); - - // Attempt to start a new batch. - for (i = 0; i < KFREE_N_BATCHES; i++) { - struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - - // Try to detach bulk_head or head and attach it, only when - // all channels are free. Any channel is not free means at krwp - // there is on-going rcu work to handle krwp's free business. - if (need_wait_for_krwp_work(krwp)) - continue; - - // kvfree_rcu_drain_ready() might handle this krcp, if so give up. - if (need_offload_krc(krcp)) { - // Channel 1 corresponds to the SLAB-pointer bulk path. - // Channel 2 corresponds to vmalloc-pointer bulk path. - for (j = 0; j < FREE_N_CHANNELS; j++) { - if (list_empty(&krwp->bulk_head_free[j])) { - atomic_set(&krcp->bulk_count[j], 0); - list_replace_init(&krcp->bulk_head[j], - &krwp->bulk_head_free[j]); - } - } - - // Channel 3 corresponds to both SLAB and vmalloc - // objects queued on the linked list. - if (!krwp->head_free) { - krwp->head_free = krcp->head; - get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); - atomic_set(&krcp->head_count, 0); - WRITE_ONCE(krcp->head, NULL); - } - - // One work is per one batch, so there are three - // "free channels", the batch can handle. Break - // the loop since it is done with this CPU thus - // queuing an RCU work is _always_ success here. - queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); - WARN_ON_ONCE(!queued); - break; - } - } - - raw_spin_unlock_irqrestore(&krcp->lock, flags); - return queued; -} - -/* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. - */ -static void kfree_rcu_monitor(struct work_struct *work) -{ - struct kfree_rcu_cpu *krcp = container_of(work, - struct kfree_rcu_cpu, monitor_work.work); - - // Drain ready for reclaim. 
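The monitor scheduling being removed above only ever pulls its timer in, never pushes it out: if work is already pending, __schedule_delayed_monitor_work() rearms it only when the newly computed delay is shorter than what remains, and a full batch shrinks the delay to a single tick. That policy is easy to isolate; the time unit and constants below are arbitrary stand-ins for jiffies, KFREE_DRAIN_JIFFIES, and the bulk threshold.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define DRAIN_DELAY    500                /* stand-in for KFREE_DRAIN_JIFFIES  */
#define BULK_THRESHOLD 100                /* stand-in for KVFREE_BULK_MAX_ENTR */

struct monitor {
        bool pending;
        long expires;                     /* absolute "time" the work fires */
};

/* Shorten-only rearm, mirroring the removed __schedule_delayed_monitor_work(). */
static void schedule_monitor(struct monitor *m, long now, int queued_objects)
{
        long delay = queued_objects >= BULK_THRESHOLD ? 1 : DRAIN_DELAY;

        if (m->pending) {
                long delay_left = m->expires - now;

                if (delay < delay_left)   /* pull the timer in...       */
                        m->expires = now + delay;
                return;                   /* ...but never push it out   */
        }
        m->pending = true;
        m->expires = now + delay;
}

int main(void)
{
        struct monitor m = { 0 };

        schedule_monitor(&m, 0, 3);       /* light load: fire at t=500     */
        assert(m.pending && m.expires == 500);

        schedule_monitor(&m, 10, 3);      /* same delay: leave it alone    */
        assert(m.expires == 500);

        schedule_monitor(&m, 20, 200);    /* heavy load: pull in to t=21   */
        assert(m.expires == 21);

        printf("monitor fires at t=%ld\n", m.expires);
        return 0;
}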
- kvfree_rcu_drain_ready(krcp); - - // Queue a batch for a rest. - kvfree_rcu_queue_batch(krcp); - - // If there is nothing to detach, it means that our job is - // successfully done here. In case of having at least one - // of the channels that is still busy we should rearm the - // work to repeat an attempt. Because previous batches are - // still in progress. - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); -} - -static enum hrtimer_restart -schedule_page_work_fn(struct hrtimer *t) -{ - struct kfree_rcu_cpu *krcp = - container_of(t, struct kfree_rcu_cpu, hrtimer); - - queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); - return HRTIMER_NORESTART; -} - -static void fill_page_cache_func(struct work_struct *work) -{ - struct kvfree_rcu_bulk_data *bnode; - struct kfree_rcu_cpu *krcp = - container_of(work, struct kfree_rcu_cpu, - page_cache_work.work); - unsigned long flags; - int nr_pages; - bool pushed; - int i; - - nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? - 1 : rcu_min_cached_objs; - - for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - - if (!bnode) - break; - - raw_spin_lock_irqsave(&krcp->lock, flags); - pushed = put_cached_bnode(krcp, bnode); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (!pushed) { - free_page((unsigned long) bnode); - break; - } - } - - atomic_set(&krcp->work_in_progress, 0); - atomic_set(&krcp->backoff_page_cache_fill, 0); -} - -static void -run_page_cache_worker(struct kfree_rcu_cpu *krcp) -{ - // If cache disabled, bail out. - if (!rcu_min_cached_objs) - return; - - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !atomic_xchg(&krcp->work_in_progress, 1)) { - if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_unbound_wq, - &krcp->page_cache_work, - msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); - } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; - hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); - } - } -} - -// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() -// state specified by flags. If can_alloc is true, the caller must -// be schedulable and not be holding any locks or mutexes that might be -// acquired by the memory allocator or anything that it might invoke. -// Returns true if ptr was successfully recorded, else the caller must -// use a fallback. -static inline bool -add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, - unsigned long *flags, void *ptr, bool can_alloc) -{ - struct kvfree_rcu_bulk_data *bnode; - int idx; - - *krcp = krc_this_cpu_lock(flags); - if (unlikely(!(*krcp)->initialized)) - return false; - - idx = !!is_vmalloc_addr(ptr); - bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], - struct kvfree_rcu_bulk_data, list); - - /* Check if a new block is required. */ - if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) { - bnode = get_cached_bnode(*krcp); - if (!bnode && can_alloc) { - krc_this_cpu_unlock(*krcp, *flags); - - // __GFP_NORETRY - allows a light-weight direct reclaim - // what is OK from minimizing of fallback hitting point of - // view. Apart of that it forbids any OOM invoking what is - // also beneficial since we are about to release memory soon. - // - // __GFP_NOMEMALLOC - prevents from consuming of all the - // memory reserves. Please note we have a fallback path. 
- // - // __GFP_NOWARN - it is supposed that an allocation can - // be failed under low memory or high memory pressure - // scenarios. - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - raw_spin_lock_irqsave(&(*krcp)->lock, *flags); - } - - if (!bnode) - return false; - - // Initialize the new block and attach it. - bnode->nr_records = 0; - list_add(&bnode->list, &(*krcp)->bulk_head[idx]); - } - - // Finally insert and update the GP for this page. - bnode->nr_records++; - bnode->records[bnode->nr_records - 1] = ptr; - get_state_synchronize_rcu_full(&bnode->gp_snap); - atomic_inc(&(*krcp)->bulk_count[idx]); - - return true; -} - -/* - * Queue a request for lazy invocation of the appropriate free routine - * after a grace period. Please note that three paths are maintained, - * two for the common case using arrays of pointers and a third one that - * is used only when the main paths cannot be used, for example, due to - * memory pressure. - * - * Each kvfree_call_rcu() request is added to a batch. The batch will be drained - * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will - * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. - */ -void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - unsigned long flags; - struct kfree_rcu_cpu *krcp; - bool success; - - /* - * Please note there is a limitation for the head-less - * variant, that is why there is a clear rule for such - * objects: it can be used from might_sleep() context - * only. For other places please embed an rcu_head to - * your data. - */ - if (!head) - might_sleep(); - - // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(ptr)) { - // Probable double kfree_rcu(), just leak. - WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", - __func__, head); - - // Mark as success and leave. - return; - } - - kasan_record_aux_stack_noalloc(ptr); - success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); - if (!success) { - run_page_cache_worker(krcp); - - if (head == NULL) - // Inline if kvfree_rcu(one_arg) call. - goto unlock_return; - - head->func = ptr; - head->next = krcp->head; - WRITE_ONCE(krcp->head, head); - atomic_inc(&krcp->head_count); - - // Take a snapshot for this krcp. - krcp->head_gp_snap = get_state_synchronize_rcu(); - success = true; - } - - /* - * The kvfree_rcu() caller considers the pointer freed at this point - * and likely removes any references to it. Since the actual slab - * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore - * this object (no scanning or false positives reporting). - */ - kmemleak_ignore(ptr); - - // Set timer to drain after KFREE_DRAIN_JIFFIES. - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - __schedule_delayed_monitor_work(krcp); - -unlock_return: - krc_this_cpu_unlock(krcp, flags); - - /* - * Inline kvfree() after synchronize_rcu(). We can do - * it from might_sleep() context only, so the current - * CPU can pass the QS state. - */ - if (!success) { - debug_rcu_head_unqueue((struct rcu_head *) ptr); - synchronize_rcu(); - kvfree(ptr); - } -} -EXPORT_SYMBOL_GPL(kvfree_call_rcu); - -/** - * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. - * - * Note that a single argument of kvfree_rcu() call has a slow path that - * triggers synchronize_rcu() following by freeing a pointer. 
It is done - * before the return from the function. Therefore for any single-argument - * call that will result in a kfree() to a cache that is to be destroyed - * during module exit, it is developer's responsibility to ensure that all - * such calls have returned before the call to kmem_cache_destroy(). - */ -void kvfree_rcu_barrier(void) -{ - struct kfree_rcu_cpu_work *krwp; - struct kfree_rcu_cpu *krcp; - bool queued; - int i, cpu; - - /* - * Firstly we detach objects and queue them over an RCU-batch - * for all CPUs. Finally queued works are flushed for each CPU. - * - * Please note. If there are outstanding batches for a particular - * CPU, those have to be finished first following by queuing a new. - */ - for_each_possible_cpu(cpu) { - krcp = per_cpu_ptr(&krc, cpu); - - /* - * Check if this CPU has any objects which have been queued for a - * new GP completion. If not(means nothing to detach), we are done - * with it. If any batch is pending/running for this "krcp", below - * per-cpu flush_rcu_work() waits its completion(see last step). - */ - if (!need_offload_krc(krcp)) - continue; - - while (1) { - /* - * If we are not able to queue a new RCU work it means: - * - batches for this CPU are still in flight which should - * be flushed first and then repeat; - * - no objects to detach, because of concurrency. - */ - queued = kvfree_rcu_queue_batch(krcp); - - /* - * Bail out, if there is no need to offload this "krcp" - * anymore. As noted earlier it can run concurrently. - */ - if (queued || !need_offload_krc(krcp)) - break; - - /* There are ongoing batches. */ - for (i = 0; i < KFREE_N_BATCHES; i++) { - krwp = &(krcp->krw_arr[i]); - flush_rcu_work(&krwp->rcu_work); - } - } - } - - /* - * Now we guarantee that all objects are flushed. - */ - for_each_possible_cpu(cpu) { - krcp = per_cpu_ptr(&krc, cpu); - - /* - * A monitor work can drain ready to reclaim objects - * directly. Wait its completion if running or pending. - */ - cancel_delayed_work_sync(&krcp->monitor_work); - - for (i = 0; i < KFREE_N_BATCHES; i++) { - krwp = &(krcp->krw_arr[i]); - flush_rcu_work(&krwp->rcu_work); - } - } -} -EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); - -static unsigned long -kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu; - unsigned long count = 0; - - /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - count += krc_count(krcp); - count += READ_ONCE(krcp->nr_bkv_objs); - atomic_set(&krcp->backoff_page_cache_fill, 1); - } - - return count == 0 ? SHRINK_EMPTY : count; -} - -static unsigned long -kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu, freed = 0; - - for_each_possible_cpu(cpu) { - int count; - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - count = krc_count(krcp); - count += drain_page_cache(krcp); - kfree_rcu_monitor(&krcp->monitor_work.work); - - sc->nr_to_scan -= count; - freed += count; - - if (sc->nr_to_scan <= 0) - break; - } - - return freed == 0 ? SHRINK_STOP : freed; -} - -void __init kfree_rcu_scheduler_running(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); - } -} - /* * During early boot, any blocking grace-period wait automatically * implies a grace period. @@ -4038,7 +3244,7 @@ static void synchronize_rcu_normal(void) * snapshot before adding a request. 
*/ if (IS_ENABLED(CONFIG_PROVE_RCU)) - rs.head.func = (void *) get_state_synchronize_rcu(); + get_state_synchronize_rcu_full(&rs.oldstate); rcu_sr_normal_add_req(&rs); @@ -4181,14 +3387,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { - struct rcu_node *rnp = rcu_get_root(); - /* * Any prior manipulation of RCU-protected data must happen * before the loads from ->gp_seq and ->expedited_sequence. */ smp_mb(); /* ^^^ */ - rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq); + + // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use + // in poll_state_synchronize_rcu_full() notwithstanding. Use of + // the latter here would result in too-short grace periods due to + // interactions with newly onlined CPUs. + rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq); rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full); @@ -4895,6 +4104,22 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } +static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) +{ + cpumask_var_t affinity; + int cpu; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return; + + for_each_leaf_node_possible_cpu(rnp, cpu) + cpumask_set_cpu(cpu, affinity); + + kthread_affine_preferred(t, affinity); + + free_cpumask_var(affinity); +} + struct kthread_worker *rcu_exp_gp_kworker; static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) @@ -4917,16 +4142,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); -} - -static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) -{ - struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker); - if (!kworker) - return NULL; - - return kworker->task; + rcu_thread_affine_rnp(kworker->task, rnp); + wake_up_process(kworker->task); } static void __init rcu_start_exp_gp_kworker(void) @@ -4934,7 +4152,7 @@ static void __init rcu_start_exp_gp_kworker(void) const char *name = "rcu_exp_gp_kthread_worker"; struct sched_param param = { .sched_priority = kthread_prio }; - rcu_exp_gp_kworker = kthread_create_worker(0, name); + rcu_exp_gp_kworker = kthread_run_worker(0, name); if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { pr_err("Failed to create %s!\n", name); rcu_exp_gp_kworker = NULL; @@ -5012,67 +4230,6 @@ int rcutree_prepare_cpu(unsigned int cpu) } /* - * Update kthreads affinity during CPU-hotplug changes. - * - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - * - * Any future concurrent calls are serialized via ->kthread_mutex. - */ -static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) -{ - cpumask_var_t cm; - unsigned long mask; - struct rcu_data *rdp; - struct rcu_node *rnp; - struct task_struct *task_boost, *task_exp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - - task_boost = rcu_boost_task(rnp); - task_exp = rcu_exp_par_gp_task(rnp); - - /* - * If CPU is the boot one, those tasks are created later from early - * initcall since kthreadd must be created first. 
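rcu_thread_affine_rnp() above replaces the hotplug-time affinity juggling in the rcutree_affinity_setting() function being deleted here: a kthread's preferred affinity is declared once, as the set of CPUs belonging to its leaf rcu_node, via kthread_affine_preferred(). A rough user-space analogue with pthread affinity; the CPU range is a made-up stand-in for a leaf node's CPUs, and the pthread call imposes hard affinity rather than the kernel's preference.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static void *worker(void *arg)
{
        (void)arg;
        printf("worker running on CPU %d\n", sched_getcpu());
        return NULL;
}

int main(void)
{
        cpu_set_t affinity;
        pthread_attr_t attr;
        pthread_t tid;
        int first_cpu = 0, nr_cpus = 2;   /* made-up "leaf node" CPU span */

        /* Build the mask, cf. the for_each_leaf_node_possible_cpu() loop */
        /* over zalloc_cpumask_var()'d storage in rcu_thread_affine_rnp(). */
        CPU_ZERO(&affinity);
        for (int cpu = first_cpu; cpu < first_cpu + nr_cpus; cpu++)
                CPU_SET(cpu, &affinity);

        /* Apply it at creation time, loosely mirroring the one-shot       */
        /* kthread_affine_preferred() call.                                */
        pthread_attr_init(&attr);
        pthread_attr_setaffinity_np(&attr, sizeof(affinity), &affinity);
        if (pthread_create(&tid, &attr, worker, NULL)) {
                fprintf(stderr, "create failed (CPUs 0-%d offline?)\n",
                        first_cpu + nr_cpus - 1);
                return 1;
        }
        pthread_attr_destroy(&attr);
        pthread_join(tid, NULL);
        return 0;
}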
- */ - if (!task_boost && !task_exp) - return; - - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - - mutex_lock(&rnp->kthread_mutex); - mask = rcu_rnp_online_cpus(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) - if ((mask & leaf_node_cpu_bit(rnp, cpu)) && - cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) { - cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (outgoingcpu >= 0) - cpumask_clear_cpu(outgoingcpu, cm); - } - - if (task_exp) - set_cpus_allowed_ptr(task_exp, cm); - - if (task_boost) - set_cpus_allowed_ptr(task_boost, cm); - - mutex_unlock(&rnp->kthread_mutex); - - free_cpumask_var(cm); -} - -/* * Has the specified (known valid) CPU ever been fully online? */ bool rcu_cpu_beenfullyonline(int cpu) @@ -5100,7 +4257,6 @@ int rcutree_online_cpu(unsigned int cpu) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); @@ -5317,8 +4473,6 @@ int rcutree_offline_cpu(unsigned int cpu) rnp->ffmask &= ~rdp->grpmask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcutree_affinity_setting(cpu, cpu); - // nohz_full CPUs need the tick for stop-machine to work quickly tick_dep_set(TICK_DEP_BIT_RCU); return 0; @@ -5648,62 +4802,12 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; -static void __init kfree_rcu_batch_init(void) -{ - int cpu; - int i, j; - struct shrinker *kfree_rcu_shrinker; - - /* Clamp it to [0:100] seconds interval. */ - if (rcu_delay_page_cache_fill_msec < 0 || - rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { - - rcu_delay_page_cache_fill_msec = - clamp(rcu_delay_page_cache_fill_msec, 0, - (int) (100 * MSEC_PER_SEC)); - - pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", - rcu_delay_page_cache_fill_msec); - } - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - for (i = 0; i < KFREE_N_BATCHES; i++) { - INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); - krcp->krw_arr[i].krcp = krcp; - - for (j = 0; j < FREE_N_CHANNELS; j++) - INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]); - } - - for (i = 0; i < FREE_N_CHANNELS; i++) - INIT_LIST_HEAD(&krcp->bulk_head[i]); - - INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); - INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); - krcp->initialized = true; - } - - kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree"); - if (!kfree_rcu_shrinker) { - pr_err("Failed to allocate kfree_rcu() shrinker!\n"); - return; - } - - kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; - kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; - - shrinker_register(kfree_rcu_shrinker); -} - void __init rcu_init(void) { int cpu = smp_processor_id(); rcu_early_boot_tests(); - kfree_rcu_batch_init(); rcu_bootup_announce(); sanitize_kthread_prio(); rcu_init_geometry(); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index fb664d3a01c9..8d4895c854c5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -227,20 +227,22 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. + * specified leaf rcu_node structure, which is acquired by the caller. 
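As the reworded comment above says, and as the __releases(rnp->lock) annotation in the next hunk makes explicit, rcu_report_exp_cpu_mult() now runs with the node lock already held by its caller and takes responsibility for dropping it on every return path. A compact pthread sketch of that hand-off contract; the empty annotation macro below merely mimics sparse's __releases() marker.

#include <pthread.h>
#include <stdio.h>

#define __releases(x)                     /* documentation only, cf. sparse */

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long expmask = 0x6;       /* toy per-node state */

/* Called with node_lock held; releases it on every return path. */
static void report_cpu_mult(unsigned long mask) __releases(node_lock)
{
        if (!(expmask & mask)) {          /* nothing to report...          */
                pthread_mutex_unlock(&node_lock);
                return;                   /* ...but still drop the lock    */
        }
        expmask &= ~mask;
        pthread_mutex_unlock(&node_lock);
        printf("reported mask %#lx, remaining %#lx\n", mask, expmask);
}

int main(void)
{
        pthread_mutex_lock(&node_lock);   /* caller acquires...            */
        report_cpu_mult(0x2);             /* ...callee releases            */

        pthread_mutex_lock(&node_lock);
        report_cpu_mult(0x8);             /* early-return path also unlocks */
        return 0;
}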
*/ -static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, - unsigned long mask, bool wake) +static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, + unsigned long mask_in, bool wake) + __releases(rnp->lock) { int cpu; - unsigned long flags; + unsigned long mask; struct rcu_data *rdp; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { + raw_lockdep_assert_held_rcu_node(rnp); + if (!(rnp->expmask & mask_in)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } + mask = mask_in & rnp->expmask; WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask); for_each_leaf_node_cpu_mask(rnp, cpu, mask) { rdp = per_cpu_ptr(&rcu_data, cpu); @@ -257,8 +259,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { + unsigned long flags; + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); - rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); + ASSERT_EXCLUSIVE_WRITER(rdp->cpu_no_qs.b.exp); + rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true); } /* Common code for work-done checking. */ @@ -432,8 +439,10 @@ retry_ipi: raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); + if (mask_ofl_test) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_report_exp_cpu_mult(rnp, flags, mask_ofl_test, false); + } } static void rcu_exp_sel_wait_wake(unsigned long s); @@ -712,6 +721,18 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) +{ + lockdep_assert_irqs_disabled(); + ASSERT_EXCLUSIVE_WRITER_SCOPED(*this_cpu_ptr(&rcu_data.cpu_no_qs.b.exp)); + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + #ifdef CONFIG_PREEMPT_RCU /* @@ -730,24 +751,34 @@ static void rcu_exp_handler(void *unused) struct task_struct *t = current; /* - * First, the common case of not being in an RCU read-side + * First, is there no need for a quiescent state from this CPU, + * or is this CPU already looking for a quiescent state for the + * current grace period? If either is the case, just leave. + * However, this should not happen due to the preemptible + * sync_sched_exp_online_cleanup() implementation being a no-op, + * so warn if this does happen. + */ + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); + if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + READ_ONCE(rdp->cpu_no_qs.b.exp))) + return; + + /* + * Second, the common case of not being in an RCU read-side * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - rcu_is_cpu_rrupt_from_idle()) { + rcu_is_cpu_rrupt_from_idle()) rcu_report_exp_rdp(rdp); - } else { - WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + else + rcu_exp_need_qs(); return; } /* - * Second, the less-common case of being in an RCU read-side + * Third, the less-common case of being in an RCU read-side * critical section. In this case we can count on a future * rcu_read_unlock(). 
However, this rcu_read_unlock() might * execute on some other CPU, but in that case there will be @@ -768,7 +799,7 @@ static void rcu_exp_handler(void *unused) return; } - // Finally, negative nesting depth should not happen. + // Fourth and finally, negative nesting depth should not happen. WARN_ON_ONCE(1); } @@ -835,16 +866,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) #else /* #ifdef CONFIG_PREEMPT_RCU */ -/* Request an expedited quiescent state. */ -static void rcu_exp_need_qs(void) -{ - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); -} - /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { @@ -852,6 +873,7 @@ static void rcu_exp_handler(void *unused) struct rcu_node *rnp = rdp->mynode; bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 2605dd234a13..5ff3bc56ff51 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1557,8 +1557,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { - char bufw[20]; - char bufr[20]; + char bufd[22]; + char bufw[45]; + char bufr[45]; + char bufn[22]; + char bufb[22]; struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; @@ -1572,9 +1575,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) typeof(*rdp), nocb_entry_rdp); - sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); - sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); - pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", + sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]); + sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL], + rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]); + sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass)); + pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], @@ -1586,12 +1593,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], + rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd, ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn, ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], + !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb, rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', rdp->nocb_cb_kthread ? 
(int)task_cpu(rdp->nocb_cb_kthread) : -1, diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3927ea5f7955..3c0bbbbb686f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -275,6 +275,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rcu_report_exp_rdp(rdp); else WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); } /* @@ -832,8 +833,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); @@ -974,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs @@ -1217,16 +1230,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + rcu_thread_affine_rnp(t, rnp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ } -static struct task_struct *rcu_boost_task(struct rcu_node *rnp) -{ - return READ_ONCE(rnp->boost_kthread_task); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1243,10 +1253,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { } -static struct task_struct *rcu_boost_task(struct rcu_node *rnp) -{ - return NULL; -} #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f8436969e0c8..c912b594ba98 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -527,12 +527,12 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) /* Get rcutorture access to sched_setaffinity(). 
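rcu_spawn_one_boost_kthread() above follows the usual create, set-policy, then wake sequence for an RT kthread. A generic sketch of that pattern, with hypothetical names:

#include <linux/err.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>

static void spawn_rt_worker(int (*fn)(void *), void *arg, int prio)
{
	struct sched_param sp = { .sched_priority = prio };
	struct task_struct *t = kthread_create(fn, arg, "example-rt");

	if (IS_ERR(t))
		return;
	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);	/* set policy before first run */
	wake_up_process(t);				/* let it reach its run loop */
}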
*/ -long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn) { int ret; ret = sched_setaffinity(pid, in_mask); - WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); + WARN_ONCE(dowarn && ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); return ret; } EXPORT_SYMBOL_GPL(torture_sched_setaffinity); diff --git a/kernel/reboot.c b/kernel/reboot.c index a701000bab34..41ab9e1ba357 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -704,6 +704,7 @@ void kernel_power_off(void) migrate_to_reboot_cpu(); syscore_shutdown(); pr_emerg("Power down\n"); + pr_flush(1000, true); kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_power_off(); } @@ -1287,7 +1288,7 @@ static struct attribute *reboot_attrs[] = { }; #ifdef CONFIG_SYSCTL -static struct ctl_table kern_reboot_table[] = { +static const struct ctl_table kern_reboot_table[] = { { .procname = "poweroff_cmd", .data = &poweroff_cmd, diff --git a/kernel/resource.c b/kernel/resource.c index b7c0e24d9398..12004452d999 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1683,8 +1683,7 @@ void __devm_release_region(struct device *dev, struct resource *parent, { struct region_devres match_data = { parent, start, n }; - __release_region(parent, start, n); - WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, + WARN_ON(devres_release(dev, devm_region_release, devm_region_match, &match_data)); } EXPORT_SYMBOL(__devm_release_region); diff --git a/kernel/rseq.c b/kernel/rseq.c index 9de6e35fe679..b7a1ec327e81 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -13,6 +13,7 @@ #include <linux/syscalls.h> #include <linux/rseq.h> #include <linux/types.h> +#include <linux/ratelimit.h> #include <asm/ptrace.h> #define CREATE_TRACE_POINTS @@ -25,6 +26,78 @@ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) +#ifdef CONFIG_DEBUG_RSEQ +static struct rseq *rseq_kernel_fields(struct task_struct *t) +{ + return (struct rseq *) t->rseq_fields; +} + +static int rseq_validate_ro_fields(struct task_struct *t) +{ + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + u32 cpu_id_start, cpu_id, node_id, mm_cid; + struct rseq __user *rseq = t->rseq; + + /* + * Validate fields which are required to be read-only by + * user-space. + */ + if (!user_read_access_begin(rseq, t->rseq_len)) + goto efault; + unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); + unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); + unsafe_get_user(node_id, &rseq->node_id, efault_end); + unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); + user_read_access_end(); + + if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || + cpu_id != rseq_kernel_fields(t)->cpu_id || + node_id != rseq_kernel_fields(t)->node_id || + mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { + + pr_warn("Detected rseq corruption for pid: %d, name: %s\n" + "\tcpu_id_start: %u ?= %u\n" + "\tcpu_id: %u ?= %u\n" + "\tnode_id: %u ?= %u\n" + "\tmm_cid: %u ?= %u\n", + t->pid, t->comm, + cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, + cpu_id, rseq_kernel_fields(t)->cpu_id, + node_id, rseq_kernel_fields(t)->node_id, + mm_cid, rseq_kernel_fields(t)->mm_cid); + } + + /* For now, only print a console warning on mismatch. 
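rseq_validate_ro_fields() above rate-limits its corruption warning. The same ratelimit idiom in isolation, with hypothetical names:

#include <linux/printk.h>
#include <linux/ratelimit.h>
#include <linux/types.h>

static DEFINE_RATELIMIT_STATE(mismatch_rs, DEFAULT_RATELIMIT_INTERVAL,
			      DEFAULT_RATELIMIT_BURST);

static void report_mismatch(u32 seen, u32 expected)
{
	/* Emits at most a burst of messages per interval (10 per 5s by default). */
	if (seen != expected && __ratelimit(&mismatch_rs))
		pr_warn("example: read-only field changed: %u != %u\n", seen, expected);
}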
*/ + return 0; + +efault_end: + user_read_access_end(); +efault: + return -EFAULT; +} + +/* + * Update an rseq field and its in-kernel copy in lock-step to keep a coherent + * state. + */ +#define rseq_unsafe_put_user(t, value, field, error_label) \ + do { \ + unsafe_put_user(value, &t->rseq->field, error_label); \ + rseq_kernel_fields(t)->field = value; \ + } while (0) + +#else +static int rseq_validate_ro_fields(struct task_struct *t) +{ + return 0; +} + +#define rseq_unsafe_put_user(t, value, field, error_label) \ + unsafe_put_user(value, &t->rseq->field, error_label) +#endif + /* * * Restartable sequences are a lightweight interface that allows @@ -92,13 +165,20 @@ static int rseq_update_cpu_node_id(struct task_struct *t) u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); + /* + * Validate read-only rseq fields. + */ + if (rseq_validate_ro_fields(t)) + goto efault; WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; - unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); - unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_put_user(node_id, &rseq->node_id, efault_end); - unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); + + rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if @@ -116,39 +196,69 @@ efault: static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { + struct rseq __user *rseq = t->rseq; u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; /* - * Reset cpu_id_start to its initial state (0). + * Validate read-only rseq fields. */ - if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) - return -EFAULT; - /* - * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming - * in after unregistration can figure out that rseq needs to be - * registered again. - */ - if (put_user(cpu_id, &t->rseq->cpu_id)) - return -EFAULT; - /* - * Reset node_id to its initial state (0). - */ - if (put_user(node_id, &t->rseq->node_id)) - return -EFAULT; + if (rseq_validate_ro_fields(t)) + goto efault; + + if (!user_write_access_begin(rseq, t->rseq_len)) + goto efault; + /* - * Reset mm_cid to its initial state (0). + * Reset all fields to their initial state. + * + * All fields have an initial state of 0 except cpu_id which is set to + * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after + * unregistration can figure out that rseq needs to be registered + * again. */ - if (put_user(mm_cid, &t->rseq->mm_cid)) - return -EFAULT; + rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); + rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); + rseq_unsafe_put_user(t, node_id, node_id, efault_end); + rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ + user_write_access_end(); + return 0; + +efault_end: + user_write_access_end(); +efault: + return -EFAULT; +} + +/* + * Get the user-space pointer value stored in the 'rseq_cs' field. 
+ */ +static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) +{ + if (!rseq_cs) + return -EFAULT; + +#ifdef CONFIG_64BIT + if (get_user(*rseq_cs, &rseq->rseq_cs)) + return -EFAULT; +#else + if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) + return -EFAULT; +#endif + return 0; } +/* + * If the rseq_cs field of 'struct rseq' contains a valid pointer to + * user-space, copy 'struct rseq_cs' from user-space and validate its fields. + */ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; @@ -157,17 +267,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) u32 sig; int ret; -#ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) - return -EFAULT; -#endif + ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); + if (ret) + return ret; + + /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */ if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } + /* Check that the pointer value fits in the user-space process space. */ if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; @@ -243,7 +352,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) return !!event_mask; } -static int clear_rseq_cs(struct task_struct *t) +static int clear_rseq_cs(struct rseq __user *rseq) { /* * The rseq_cs field is set to NULL on preemption or signal @@ -254,9 +363,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs); + return put_user(0UL, &rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) + if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) return -EFAULT; return 0; #endif @@ -288,11 +397,11 @@ static int rseq_ip_fixup(struct pt_regs *regs) * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t); + return clear_rseq_cs(t->rseq); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; - ret = clear_rseq_cs(t); + ret = clear_rseq_cs(t->rseq); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, @@ -366,6 +475,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; + u64 rseq_cs; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -420,9 +530,38 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; + + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. + */ + if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) + return -EFAULT; + if (rseq_cs && clear_rseq_cs(rseq)) + return -EFAULT; + +#ifdef CONFIG_DEBUG_RSEQ + /* + * Initialize the in-kernel rseq fields copy for validation of + * read-only fields. 
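The sys_rseq() registration hunk above clears any stale rseq_cs pointer and records the area in the task struct. For orientation, a hypothetical user-space sketch of the matching registration call, reading one of the kernel-maintained read-only fields; the signature value is arbitrary, and the call may fail if the C library has already registered rseq for this thread:

#define _GNU_SOURCE
#include <linux/rseq.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#define EXAMPLE_RSEQ_SIG 0x53053053u	/* arbitrary signature for this sketch */

static __thread struct rseq rs;

int main(void)
{
	if (syscall(__NR_rseq, &rs, sizeof(rs), 0, EXAMPLE_RSEQ_SIG) != 0) {
		perror("rseq");
		return 1;
	}
	/* cpu_id_start, cpu_id, node_id and mm_cid are written by the kernel only. */
	printf("registered; currently on CPU %u\n", rs.cpu_id);
	return 0;
}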
+ */ + if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || + get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || + get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || + get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) + return -EFAULT; +#endif + /* + * Activate the registration by setting the rseq area address, length + * and signature in the task struct. + */ current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd45..8ae86371ddcd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -22,6 +22,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_build_policy.o += -DDISABLE_BRANCH_PROFILING +CFLAGS_build_utility.o += -DDISABLE_BRANCH_PROFILING +endif # # Build efficiency: # diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index db68a964e34e..2b331822c7e7 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -9,7 +9,7 @@ static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; #ifdef CONFIG_SYSCTL -static struct ctl_table sched_autogroup_sysctls[] = { +static const struct ctl_table sched_autogroup_sysctls[] = { { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, @@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p) * see this thread after that: we can no longer use signal->autogroup. * See the PF_EXITING check in task_wants_autogroup(). */ - sched_move_task(p); + sched_move_task(p, true); } static void @@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * sched_autogroup_exit_task(). 
*/ for_each_thread(p, t) - sched_move_task(t); + sched_move_task(t, true); unlock_task_sighand(p, &flags); autogroup_kref_put(prev); diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index fae1f5c921eb..72d97aa8b726 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -61,6 +61,7 @@ #ifdef CONFIG_SCHED_CLASS_EXT # include "ext.c" +# include "ext_idle.c" #endif #include "syscalls.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 80a3df49ab47..bf9d8db94b70 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -68,9 +68,7 @@ # include "cpufreq_schedutil.c" #endif -#ifdef CONFIG_SCHED_DEBUG -# include "debug.c" -#endif +#include "debug.c" #ifdef CONFIG_SCHEDSTATS # include "stats.c" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e5a6bf587f9..87540217fc09 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,6 @@ #include "autogroup.h" #include "pelt.h" #include "smp.h" -#include "stats.h" #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" @@ -119,7 +118,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits * @@ -129,7 +127,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); */ #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | -const_debug unsigned int sysctl_sched_features = +__read_mostly unsigned int sysctl_sched_features = #include "features.h" 0; #undef SCHED_FEAT @@ -143,13 +141,12 @@ const_debug unsigned int sysctl_sched_features = */ __read_mostly int sysctl_resched_latency_warn_ms = 100; __read_mostly int sysctl_resched_latency_warn_once = 1; -#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ -const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; +__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; @@ -740,39 +737,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) s64 __maybe_unused steal = 0, irq_delta = 0; #ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + if (irqtime_enabled()) { + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}IRQ region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. - * - * It does however cause some slight miss-attribution of {soft,}IRQ - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}IRQ region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. 
+ * + * It does however cause some slight miss-attribution of {soft,}IRQ + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; - rq->prev_irq_time += irq_delta; - delta -= irq_delta; - delayacct_irq(rq->curr, irq_delta); + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + delayacct_irq(rq->curr, irq_delta); + } #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { - steal = paravirt_steal_clock(cpu_of(rq)); + u64 prev_steal; + + steal = prev_steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - rq->prev_steal_time_rq += steal; + rq->prev_steal_time_rq = prev_steal; delta -= steal; } #endif @@ -789,22 +790,25 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) void update_rq_clock(struct rq *rq) { s64 delta; + u64 clock; lockdep_assert_rq_held(rq); if (rq->clock_update_flags & RQCF_ACT_SKIP) return; -#ifdef CONFIG_SCHED_DEBUG if (sched_feat(WARN_DOUBLE_CLOCK)) - SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; -#endif - delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + clock = sched_clock_cpu(cpu_of(rq)); + scx_rq_clock_update(rq, clock); + + delta = clock - rq->clock; if (delta < 0) return; rq->clock += delta; + update_rq_clock_task(rq, delta); } @@ -908,8 +912,7 @@ static void hrtick_rq_init(struct rq *rq) #ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - rq->hrtick_timer.function = hrtick; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) @@ -1055,9 +1058,10 @@ void wake_up_q(struct wake_q_head *head) struct task_struct *task; task = container_of(node, struct task_struct, wake_q); - /* Task can safely be re-inserted now: */ node = node->next; - task->wake_q.next = NULL; + /* pairs with cmpxchg_relaxed() in __wake_q_add() */ + WRITE_ONCE(task->wake_q.next, NULL); + /* Task can safely be re-inserted now. 
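Several hunks in this patch (hrtick_rq_init() above, the deadline timers later on) replace the two-step hrtimer_init() plus callback assignment with hrtimer_setup(). The before/after shape, shown with a hypothetical timer and callback:

/* before: initialize, then assign the callback separately */
hrtimer_init(&ex->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
ex->timer.function = example_timer_fn;

/* after: clock, mode and callback in one call */
hrtimer_setup(&ex->timer, example_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);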
*/ /* * wake_up_process() executes a full barrier, which pairs with @@ -1168,13 +1172,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); guard(rcu)(); @@ -1189,7 +1193,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); return default_cpu; } @@ -1341,7 +1345,7 @@ bool sched_can_stop_tick(struct rq *rq) if (scx_enabled() && !scx_can_stop_tick(rq)) return false; - if (rq->cfs.h_nr_running > 1) + if (rq->cfs.h_nr_queued > 1) return false; /* @@ -1711,7 +1715,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, bucket = &uc_rq->bucket[uc_se->bucket_id]; - SCHED_WARN_ON(!bucket->tasks); + WARN_ON_ONCE(!bucket->tasks); if (likely(bucket->tasks)) bucket->tasks--; @@ -1731,7 +1735,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * Defensive programming: this should never happen. If it happens, * e.g. due to future modification, warn and fix up the expected value. */ - SCHED_WARN_ON(bucket->value > rq_clamp); + WARN_ON_ONCE(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); uclamp_rq_set(rq, clamp_id, bkt_clamp); @@ -1748,7 +1752,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) @@ -1775,7 +1779,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) * The condition is constructed such that a NOP is generated when * sched_uclamp_used is disabled. */ - if (!static_branch_unlikely(&sched_uclamp_used)) + if (!uclamp_is_used()) return; if (unlikely(!p->sched_class->uclamp_enabled)) @@ -1933,12 +1937,12 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write, } if (update_root_tg) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_update_root_tg(); } if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); uclamp_sync_util_min_rt_default(); } @@ -2113,7 +2117,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - SCHED_WARN_ON(flags & DEQUEUE_SLEEP); + WARN_ON_ONCE(flags & DEQUEUE_SLEEP); WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ASSERT_EXCLUSIVE_WRITER(p->on_rq); @@ -2718,7 +2722,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) * XXX do further audits, this smells like something putrid. 
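The uclamp hunks above stop open-coding static_branch_unlikely(&sched_uclamp_used) and call uclamp_is_used()/sched_uclamp_enable() instead. A generic sketch of that wrapper pattern around a static key, with hypothetical names:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_feature_used);

static inline bool example_feature_is_used(void)
{
	/* Patched at runtime; costs a single no-op branch while disabled. */
	return static_branch_unlikely(&example_feature_used);
}

static inline void example_feature_enable(void)
{
	static_branch_enable(&example_feature_used);
}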
*/ if (ctx->flags & SCA_MIGRATE_DISABLE) - SCHED_WARN_ON(!p->on_cpu); + WARN_ON_ONCE(!p->on_cpu); else lockdep_assert_held(&p->pi_lock); @@ -3283,7 +3287,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { -#ifdef CONFIG_SCHED_DEBUG unsigned int state = READ_ONCE(p->__state); /* @@ -3321,7 +3324,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(!cpu_online(new_cpu)); WARN_ON_ONCE(is_migration_disabled(p)); -#endif trace_sched_migrate_task(p, new_cpu); @@ -3534,7 +3536,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * * More yuck to audit. */ - do_set_cpus_allowed(p, task_cpu_possible_mask(p)); + do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: @@ -3913,13 +3915,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { - /* - * The BPF scheduler may depend on select_task_rq() being invoked during - * wakeups. In addition, @p may end up executing on a different CPU - * regardless of what happens in the wakeup path making the ttwu_queue - * optimization less meaningful. Skip if on SCX. - */ - if (task_on_scx(p)) + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ + if (!scx_allow_ttwu_queue(p)) return false; /* @@ -4187,7 +4184,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4481,7 +4478,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_LIST_HEAD(&p->se.group_node); /* A delayed task cannot be in clone(). */ - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -4646,7 +4643,7 @@ static int sysctl_schedstats(const struct ctl_table *table, int write, void *buf #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SYSCTL -static struct ctl_table sched_core_sysctls[] = { +static const struct ctl_table sched_core_sysctls[] = { #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", @@ -5569,7 +5566,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_SCHED_DEBUG static u64 cpu_resched_latency(struct rq *rq) { int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); @@ -5614,9 +5610,6 @@ static int __init setup_resched_latency_warn_ms(char *str) return 1; } __setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); -#else -static inline u64 cpu_resched_latency(struct rq *rq) { return 0; } -#endif /* CONFIG_SCHED_DEBUG */ /* * This function gets called by the timer code, with HZ frequency. @@ -5632,7 +5625,7 @@ void sched_tick(void) unsigned long hw_pressure; u64 resched_latency; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) arch_scale_freq_tick(); sched_clock_tick(); @@ -5737,7 +5730,7 @@ static void sched_tick_remote(struct work_struct *work) * we are always sure that there is no proxy (only a * single task is running). 
*/ - SCHED_WARN_ON(rq->curr != rq->donor); + WARN_ON_ONCE(rq->curr != rq->donor); update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -5771,7 +5764,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5792,7 +5785,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5957,7 +5950,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CT_STATE_USER); + WARN_ON_ONCE(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -6018,7 +6011,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * opportunity to pull in more work from other CPUs. */ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && - rq->nr_running == rq->cfs.h_nr_running)) { + rq->nr_running == rq->cfs.h_nr_queued)) { p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) @@ -6641,7 +6634,6 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. */ bool preempt = sched_mode > SM_NONE; - bool block = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6702,7 +6694,7 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - block = try_to_block_task(rq, prev, prev_state); + try_to_block_task(rq, prev, prev_state); switch_count = &prev->nvcsw; } @@ -6711,9 +6703,7 @@ static void __sched notrace __schedule(int sched_mode) picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); -#ifdef CONFIG_SCHED_DEBUG rq->last_seen_need_resched_ns = 0; -#endif if (likely(prev != next)) { rq->nr_switches++; @@ -6748,7 +6738,8 @@ picked: migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); - psi_sched_switch(prev, next, block); + psi_sched_switch(prev, next, !task_on_rq_queued(prev) || + prev->se.sched_delayed); trace_sched_switch(preempt, prev, next, prev_state); @@ -6803,7 +6794,7 @@ static inline void sched_submit_work(struct task_struct *tsk) * deadlock if the callback attempts to acquire a lock which is * already acquired. 
*/ - SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT); + WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT); /* * If we are going to sleep and we have plugged IO queued, @@ -7086,7 +7077,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); + WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -7276,12 +7267,12 @@ out_unlock: #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) { - if (should_resched(0)) { + if (should_resched(0) && !irqs_disabled()) { preempt_schedule_common(); return 1; } /* - * In preemptible kernels, ->rcu_read_lock_nesting tells the tick + * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick * whether the current CPU is in an RCU read-side critical section, * so the tick can report quiescent states even for CPUs looping * in kernel context. In contrast, in non-preemptible kernels, @@ -7290,6 +7281,8 @@ int __sched __cond_resched(void) * RCU quiescent state. Therefore, the following code causes * cond_resched() to report a quiescent state, but only when RCU * is in urgent need of one. + * A third case, preemptible, but non-PREEMPT_RCU provides for + * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). */ #ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); @@ -7638,10 +7631,57 @@ PREEMPT_MODEL_ACCESSOR(lazy); #else /* !CONFIG_PREEMPT_DYNAMIC: */ +#define preempt_dynamic_mode -1 + static inline void preempt_dynamic_init(void) { } #endif /* CONFIG_PREEMPT_DYNAMIC */ +const char *preempt_modes[] = { + "none", "voluntary", "full", "lazy", NULL, +}; + +const char *preempt_model_str(void) +{ + bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) && + (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) || + IS_ENABLED(CONFIG_PREEMPT_LAZY)); + static char buf[128]; + + if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) { + struct seq_buf s; + + seq_buf_init(&s, buf, sizeof(buf)); + seq_buf_puts(&s, "PREEMPT"); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + seq_buf_printf(&s, "%sRT%s", + brace ? "_{" : "_", + brace ? "," : ""); + + if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { + seq_buf_printf(&s, "(%s)%s", + preempt_dynamic_mode > 0 ? + preempt_modes[preempt_dynamic_mode] : "undef", + brace ? "}" : ""); + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { + seq_buf_printf(&s, "LAZY%s", + brace ? 
"}" : ""); + return seq_buf_str(&s); + } + + return seq_buf_str(&s); + } + + if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD)) + return "VOLUNTARY"; + + return "NONE"; +} + int io_schedule_prepare(void) { int old_iowait = current->in_iowait; @@ -7701,9 +7741,9 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n", + pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n", free, task_pid_nr(p), task_tgid_nr(p), - ppid, read_task_thread_flags(p)); + ppid, p->flags, read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -7756,10 +7796,9 @@ void show_state_filter(unsigned int state_filter) sched_show_task(p); } -#ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -#endif + rcu_read_unlock(); /* * Only show locks if all tasks are dumped: @@ -7930,19 +7969,26 @@ void sched_setnuma(struct task_struct *p, int nid) #ifdef CONFIG_HOTPLUG_CPU /* - * Ensure that the idle task is using init_mm right before its CPU goes - * offline. + * Invoked on the outgoing CPU in context of the CPU hotplug thread + * after ensuring that there are no user space tasks left on the CPU. + * + * If there is a lazy mm in use on the hotplug thread, drop it and + * switch to init_mm. + * + * The reference count on init_mm is dropped in finish_cpu(). */ -void idle_task_exit(void) +static void sched_force_init_mm(void) { struct mm_struct *mm = current->active_mm; - BUG_ON(cpu_online(smp_processor_id())); - BUG_ON(current != this_rq()->idle); - if (mm != &init_mm) { - switch_mm(mm, &init_mm, current); + mmgrab_lazy_tlb(&init_mm); + local_irq_disable(); + current->active_mm = &init_mm; + switch_mm_irqs_off(mm, &init_mm, current); + local_irq_enable(); finish_arch_post_lock_switch(); + mmdrop_lazy_tlb(mm); } /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ @@ -8167,7 +8213,7 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. 
*/ - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); if (--num_cpus_frozen) return; /* @@ -8180,19 +8226,14 @@ static void cpuset_cpu_active(void) cpuset_update_active_cpus(); } -static int cpuset_cpu_inactive(unsigned int cpu) +static void cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - int ret = dl_bw_check_overflow(cpu); - - if (ret) - return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; - partition_sched_domains(1, NULL, NULL); + cpuset_reset_sched_domains(); } - return 0; } static inline void sched_smt_present_inc(int cpu) @@ -8254,6 +8295,11 @@ int sched_cpu_deactivate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); int ret; + ret = dl_bw_deactivate(cpu); + + if (ret) + return ret; + /* * Remove CPU from nohz.idle_cpus_mask to prevent participating in * load balancing when not active @@ -8299,15 +8345,7 @@ int sched_cpu_deactivate(unsigned int cpu) return 0; sched_update_numa(cpu, false); - ret = cpuset_cpu_inactive(cpu); - if (ret) { - sched_smt_present_inc(cpu); - sched_set_rq_online(rq, cpu); - balance_push_set(cpu, false); - set_cpu_active(cpu, true); - sched_update_numa(cpu, true); - return ret; - } + cpuset_cpu_inactive(cpu); sched_domains_numa_masks_clear(cpu); return 0; } @@ -8344,6 +8382,7 @@ int sched_cpu_starting(unsigned int cpu) int sched_cpu_wait_empty(unsigned int cpu) { balance_hotplug_wait(); + sched_force_init_mm(); return 0; } @@ -8415,9 +8454,9 @@ void __init sched_init_smp(void) * CPU masks are stable and all blatant races in the below code cannot * happen. */ - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); sched_init_domains(cpu_active_mask); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0) @@ -9007,7 +9046,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static struct task_group *sched_get_task_group(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -9019,13 +9058,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk) tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group, css); tg = autogroup_task_group(tsk, tg); - - return tg; -} - -static void sched_change_group(struct task_struct *tsk, struct task_group *group) -{ - tsk->sched_task_group = group; + tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) @@ -9042,24 +9075,15 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect * its new group. */ -void sched_move_task(struct task_struct *tsk) +void sched_move_task(struct task_struct *tsk, bool for_autogroup) { int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; - struct task_group *group; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - /* - * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous - * group changes. 
- */ - group = sched_get_task_group(tsk); - if (group == tsk->sched_task_group) - return; - update_rq_clock(rq); running = task_current_donor(rq, tsk); @@ -9070,8 +9094,9 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, group); - scx_move_task(tsk); + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -9172,7 +9197,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); + sched_move_task(task, false); scx_cgroup_finish_attach(); } @@ -9193,7 +9218,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) unsigned int clamps; lockdep_assert_held(&uclamp_mutex); - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent @@ -9285,7 +9310,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, if (req.ret) return req.ret; - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); guard(mutex)(&uclamp_mutex); guard(rcu)(); @@ -10528,7 +10553,7 @@ static void task_mm_cid_work(struct callback_head *work) struct mm_struct *mm; int weight, cpu; - SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); work->next = work; /* Prevent double-add */ if (t->flags & PF_EXITING) @@ -10590,7 +10615,7 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) return; /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC); + task_work_add(curr, work, TWA_RESUME); } void sched_mm_cid_exit_signals(struct task_struct *t) diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 1ef98a93eb1d..c4606ca89210 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -65,7 +65,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * a cookie until after we've removed it, we must have core scheduling * enabled here. 
*/ - SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq)); + WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq)); if (sched_core_enqueued(p)) sched_core_dequeue(rq, p, DEQUEUE_SAVE); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 28c77904ea74..1a19d69b91ed 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -83,7 +83,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (unlikely(sg_policy->limits_changed)) { sg_policy->limits_changed = false; - sg_policy->need_freq_update = true; + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); return true; } @@ -96,7 +96,7 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { if (sg_policy->need_freq_update) - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + sg_policy->need_freq_update = false; else if (sg_policy->next_freq == next_freq) return false; @@ -604,31 +604,6 @@ static const struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -#ifdef CONFIG_ENERGY_MODEL -static void rebuild_sd_workfn(struct work_struct *work) -{ - rebuild_sched_domains_energy(); -} - -static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); - -/* - * EAS shouldn't be attempted without sugov, so rebuild the sched_domains - * on governor changes to make sure the scheduler knows about it. - */ -static void sugov_eas_rebuild_sd(void) -{ - /* - * When called from the cpufreq_register_driver() path, the - * cpu_hotplug_lock is already held, so use a work item to - * avoid nested locking in rebuild_sched_domains(). - */ - schedule_work(&rebuild_sd_work); -} -#else -static inline void sugov_eas_rebuild_sd(void) { }; -#endif - struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) @@ -691,7 +666,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + if (policy->dvfs_possible_from_any_cpu) + set_cpus_allowed_ptr(thread, policy->related_cpus); + else + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -784,7 +763,11 @@ static int sugov_init(struct cpufreq_policy *policy) goto fail; out: - sugov_eas_rebuild_sd(); + /* + * Schedutil is the preferred governor for EAS, so rebuild sched domains + * on governor changes to make sure the scheduler knows about them. 
+ */ + em_rebuild_sched_domains(); mutex_unlock(&global_tunables_lock); return 0; @@ -826,7 +809,7 @@ static void sugov_exit(struct cpufreq_policy *policy) sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); - sugov_eas_rebuild_sd(); + em_rebuild_sched_domains(); } static int sugov_start(struct cpufreq_policy *policy) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0bed0fa1acd9..6dab4854c6c0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -22,7 +22,7 @@ */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -static int sched_clock_irqtime; +int sched_clock_irqtime; void enable_sched_clock_irqtime(void) { @@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) s64 delta; int cpu; - if (!sched_clock_irqtime) + if (!irqtime_enabled()) return; cpu = smp_processor_id(); @@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime) #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -#define sched_clock_irqtime (0) - static u64 irqtime_tick_accounted(u64 dummy) { return 0; @@ -478,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (vtime_accounting_enabled_this_cpu()) return; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_process_tick(p, user_tick, 1); return; } @@ -507,7 +505,7 @@ void account_idle_ticks(unsigned long ticks) { u64 cputime, steal; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_idle_ticks(ticks); return; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d94f2ed6d1f4..ad45a8fea245 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -26,7 +26,7 @@ static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ #ifdef CONFIG_SYSCTL -static struct ctl_table sched_dl_sysctls[] = { +static const struct ctl_table sched_dl_sysctls[] = { { .procname = "sched_deadline_period_max_us", .data = &sysctl_sched_dl_period_max, @@ -166,14 +166,14 @@ static inline unsigned long dl_bw_capacity(int i) } } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { struct root_domain *rd = cpu_rq(cpu)->rd; - if (rd->visit_gen == gen) + if (rd->visit_cookie == cookie) return true; - rd->visit_gen = gen; + rd->visit_cookie = cookie; return false; } @@ -207,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i) return SCHED_CAPACITY_SCALE; } -static inline bool dl_bw_visited(int cpu, u64 gen) +bool dl_bw_visited(int cpu, u64 cookie) { return false; } @@ -249,8 +249,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw += dl_bw; - SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); /* kick cpufreq (see the comment in kernel/sched/sched.h). */ cpufreq_update_util(rq_of_dl_rq(dl_rq), 0); } @@ -262,7 +262,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->running_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; /* kick cpufreq (see the comment in kernel/sched/sched.h). 
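dl_bw_visited() above renames the per-root-domain generation stamp to a cookie, but the visit-once-per-pass idiom is unchanged. Sketched generically with hypothetical types:

#include <linux/types.h>

struct visited_domain {
	u64 visit_cookie;	/* stamp of the last pass that handled this domain */
};

static bool domain_visited(struct visited_domain *d, u64 cookie)
{
	if (d->visit_cookie == cookie)
		return true;		/* already handled during this pass */
	d->visit_cookie = cookie;	/* claim it for the current pass */
	return false;
}

Each pass increments a shared cookie once and then skips any domain it has already stamped, so CPUs that share a root domain are processed a single time.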
*/ @@ -276,7 +276,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw += dl_bw; - SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */ + WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */ } static inline @@ -286,10 +286,10 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) lockdep_assert_rq_held(rq_of_dl_rq(dl_rq)); dl_rq->this_bw -= dl_bw; - SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */ + WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */ if (dl_rq->this_bw > old) dl_rq->this_bw = 0; - SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw); } static inline @@ -342,6 +342,29 @@ static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_s __add_rq_bw(new_bw, &rq->dl); } +static __always_inline +void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer) +{ + /* + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). + */ + if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); +} + +static __always_inline +void cancel_replenish_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->dl_timer); +} + +static __always_inline +void cancel_inactive_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->inactive_timer); +} + static void dl_change_utilization(struct task_struct *p, u64 new_bw) { WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); @@ -495,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { - if (!dl_server(dl_se)) - put_task_struct(dl_task_of(dl_se)); - } + cancel_inactive_timer(dl_se); } else { /* * Since "dl_non_contending" is not set, the @@ -1362,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = dl_task_timer; + hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } /* @@ -1819,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->inactive_timer; - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - timer->function = inactive_task_timer; + hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #define __node_2_dle(node) \ @@ -2115,13 +2133,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * The replenish timer needs to be canceled. No * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). - * - * If the timer callback was running (hrtimer_try_to_cancel == -1), - * it will eventually call put_task_struct(). */ - if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 && - !dl_server(&p->dl)) - put_task_struct(p); + cancel_replenish_timer(&p->dl); p->dl.dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { @@ -2289,8 +2302,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * will not touch the rq's active utilization, * so we are still safe. 
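The cancel_dl_timer() helper above keys its put_task_struct() on the return value of hrtimer_try_to_cancel(). For reference, the documented return values and the resulting pattern, using hypothetical names:

/*
 * hrtimer_try_to_cancel() returns:
 *    0  the timer was not queued
 *    1  the timer was queued and has now been cancelled
 *   -1  the callback is currently running and cannot be cancelled here;
 *       it will drop any reference it holds when it completes
 */
if (hrtimer_try_to_cancel(&example_timer) == 1)
	put_task_struct(example_task);	/* we cancelled it: drop the reference it pinned */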
*/ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); } sub_rq_bw(&p->dl, &rq->dl); rq_unlock(rq, &rf); @@ -2506,16 +2518,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); - -next_node: - if (next_node) { + while (next_node) { p = __node_2_pdl(next_node); if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); - goto next_node; } return NULL; @@ -2945,7 +2954,7 @@ void dl_add_task_root_domain(struct task_struct *p) struct dl_bw *dl_b; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); - if (!dl_task(p)) { + if (!dl_task(p) || dl_entity_is_special(&p->dl)) { raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return; } @@ -2964,11 +2973,26 @@ void dl_add_task_root_domain(struct task_struct *p) void dl_clear_root_domain(struct root_domain *rd) { - unsigned long flags; + int i; - raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); rd->dl_bw.total_bw = 0; - raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); + + /* + * dl_servers are not tasks. Since dl_add_task_root_domain ignores + * them, we need to account for them here explicitly. + */ + for_each_cpu(i, rd->span) { + struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; + + if (dl_server(dl_se) && cpu_active(i)) + __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i)); + } +} + +void dl_clear_root_domain_cpu(int cpu) +{ + dl_clear_root_domain(cpu_rq(cpu)->rd); } #endif /* CONFIG_SMP */ @@ -3029,8 +3053,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); /* * In case a task is setscheduled to SCHED_DEADLINE we need to keep @@ -3150,15 +3173,18 @@ DEFINE_SCHED_CLASS(dl) = { #endif }; -/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ -static u64 dl_generation; +/* + * Used for dl_bw check and update, used under sched_rt_handler()::mutex and + * sched_domains_mutex. + */ +u64 dl_cookie; int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu, cpus, ret = 0; unsigned long flags; @@ -3168,10 +3194,10 @@ int sched_dl_global_validate(void) * value smaller than the currently allocated bandwidth in * any of the root_domains. 
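dl_clear_root_domain() above adopts the scoped guard() helper from <linux/cleanup.h> in place of explicit lock/unlock calls. A minimal sketch of the idiom, with a hypothetical structure:

#include <linux/cleanup.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct counter {
	raw_spinlock_t lock;
	u64 value;
};

static void counter_reset(struct counter *c)
{
	guard(raw_spinlock_irqsave)(&c->lock);	/* lock + irqsave for this scope */
	c->value = 0;
}	/* unlock + irqrestore run automatically when the scope ends */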
*/ - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) + if (dl_bw_visited(cpu, cookie)) goto next; dl_b = dl_bw_of(cpu); @@ -3208,7 +3234,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; - u64 gen = ++dl_generation; + u64 cookie = ++dl_cookie; struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -3219,7 +3245,7 @@ void sched_dl_do_global(void) for_each_possible_cpu(cpu) { rcu_read_lock_sched(); - if (dl_bw_visited(cpu, gen)) { + if (dl_bw_visited(cpu, cookie)) { rcu_read_unlock_sched(); continue; } @@ -3453,29 +3479,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, } enum dl_bw_request { - dl_bw_req_check_overflow = 0, + dl_bw_req_deactivate = 0, dl_bw_req_alloc, dl_bw_req_free }; static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow = 0; + u64 fair_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (req == dl_bw_req_free) { + cap = dl_bw_capacity(cpu); + switch (req) { + case dl_bw_req_free: __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); - } else { - unsigned long cap = dl_bw_capacity(cpu); - + break; + case dl_bw_req_alloc: overflow = __dl_overflow(dl_b, cap, 0, dl_bw); - if (req == dl_bw_req_alloc && !overflow) { + if (!overflow) { /* * We reserve space in the destination * root_domain, as we can't fail after this point. @@ -3484,6 +3512,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) */ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); } + break; + case dl_bw_req_deactivate: + /* + * cpu is not off yet, but we need to do the math by + * considering it off already (i.e., what would happen if we + * turn cpu off?). + */ + cap -= arch_scale_cpu_capacity(cpu); + + /* + * cpu is going offline and NORMAL tasks will be moved away + * from it. We can thus discount dl_server bandwidth + * contribution as it won't need to be servicing tasks after + * the cpu is off. + */ + if (cpu_rq(cpu)->fair_server.dl_server) + fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + + /* + * Not much to check if no DEADLINE bandwidth is present. + * dl_servers we can discount, as tasks will be moved out the + * offlined CPUs anyway. + */ + if (dl_b->total_bw - fair_server_bw > 0) { + /* + * Leaving at least one CPU for DEADLINE tasks seems a + * wise thing to do. As said above, cpu is not offline + * yet, so account for that. + */ + if (dl_bw_cpus(cpu) - 1) + overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + else + overflow = 1; + } + + break; } raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -3492,9 +3556,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) return overflow ? 
-EBUSY : 0; } -int dl_bw_check_overflow(int cpu) +int dl_bw_deactivate(int cpu) { - return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); + return dl_bw_manage(dl_bw_req_deactivate, cpu, 0); } int dl_bw_alloc(int cpu, u64 dl_bw) @@ -3508,9 +3572,7 @@ void dl_bw_free(int cpu, u64 dl_bw) } #endif -#ifdef CONFIG_SCHED_DEBUG void print_dl_stats(struct seq_file *m, int cpu) { print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a1be00a988bf..56ae54e0ce6a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -244,11 +244,13 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, static int sched_dynamic_show(struct seq_file *m, void *v) { - static const char * preempt_modes[] = { - "none", "voluntary", "full", "lazy", - }; - int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + int j; + + /* Count entries in NULL terminated preempt_modes */ + for (j = 0; preempt_modes[j]; j++) + ; + j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); for (; i < j; i++) { if (preempt_dynamic_mode == i) @@ -292,7 +294,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, bool orig; cpus_read_lock(); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); orig = sched_debug_verbose; result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); @@ -304,7 +306,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, sd_dentry = NULL; } - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); cpus_read_unlock(); return result; @@ -379,7 +381,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return -EINVAL; } - if (rq->cfs.h_nr_running) { + if (rq->cfs.h_nr_queued) { update_rq_clock(rq); dl_server_stop(&rq->fair_server); } @@ -392,7 +394,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", cpu_of(rq)); - if (rq->cfs.h_nr_running) + if (rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); } @@ -515,9 +517,9 @@ static __init int sched_init_debug(void) debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); update_sched_domain_debugfs(); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); #endif #ifdef CONFIG_NUMA_BALANCING @@ -843,13 +845,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); - SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed); - SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", - cfs_rq->idle_nr_running); - SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", - cfs_rq->idle_h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: 
%ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", @@ -1265,6 +1264,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, if (task_has_dl_policy(p)) { P(dl.runtime); P(dl.deadline); + } else if (fair_policy(p->policy)) { + P(se.slice); } #ifdef CONFIG_SCHED_CLASS_EXT __PS("ext.enabled", task_on_scx(p)); @@ -1295,8 +1296,10 @@ void resched_latency_warn(int cpu, u64 latency) { static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1); - WARN(__ratelimit(&latency_check_ratelimit), - "sched: CPU %d need_resched set for > %llu ns (%d ticks) " - "without schedule\n", - cpu, latency, cpu_rq(cpu)->ticks_without_resched); + if (likely(!__ratelimit(&latency_check_ratelimit))) + return; + + pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n", + cpu, latency, cpu_rq(cpu)->ticks_without_resched); + dump_stack(); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 19813b387ef9..21575d39c376 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6,6 +6,9 @@ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> * Copyright (c) 2022 David Vernet <dvernet@meta.com> */ +#include <linux/btf_ids.h> +#include "ext_idle.h" + #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) enum scx_consts { @@ -93,7 +96,7 @@ enum scx_ops_flags { /* * Keep built-in idle tracking even if ops.update_idle() is implemented. */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, /* * By default, if there are no other task to run on the CPU, ext core @@ -101,7 +104,7 @@ enum scx_ops_flags { * flag is specified, such tasks are passed to ops.enqueue() with * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. */ - SCX_OPS_ENQ_LAST = 1LLU << 1, + SCX_OPS_ENQ_LAST = 1LLU << 1, /* * An exiting task may schedule after PF_EXITING is set. In such cases, @@ -114,13 +117,48 @@ enum scx_ops_flags { * depend on pid lookups and wants to handle these tasks directly, the * following flag can be used. */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, + SCX_OPS_ENQ_EXITING = 1LLU << 2, /* * If set, only tasks with policy set to SCHED_EXT are attached to * sched_ext. If clear, SCHED_NORMAL tasks are also included. */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue. If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). + */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the ops.select_cpu() selected or the wakee's + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline + * transfers. When this optimization is enabled, ops.select_cpu() is + * skipped in some cases (when racing against the wakee switching out). + * As the BPF scheduler may depend on ops.select_cpu() being invoked + * during wakeups, queued wakeup is disabled by default. 
+ * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. + */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, /* * CPU cgroup support flags @@ -130,7 +168,10 @@ enum scx_ops_flags { SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | SCX_OPS_HAS_CGROUP_WEIGHT, }; @@ -206,7 +247,7 @@ struct scx_dump_ctx { */ struct sched_ext_ops { /** - * select_cpu - Pick the target CPU for a task which is being woken up + * @select_cpu: Pick the target CPU for a task which is being woken up * @p: task being woken up * @prev_cpu: the cpu @p was on before sleeping * @wake_flags: SCX_WAKE_* @@ -233,7 +274,7 @@ struct sched_ext_ops { s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); /** - * enqueue - Enqueue a task on the BPF scheduler + * @enqueue: Enqueue a task on the BPF scheduler * @p: task being enqueued * @enq_flags: %SCX_ENQ_* * @@ -248,7 +289,7 @@ struct sched_ext_ops { void (*enqueue)(struct task_struct *p, u64 enq_flags); /** - * dequeue - Remove a task from the BPF scheduler + * @dequeue: Remove a task from the BPF scheduler * @p: task being dequeued * @deq_flags: %SCX_DEQ_* * @@ -264,7 +305,7 @@ struct sched_ext_ops { void (*dequeue)(struct task_struct *p, u64 deq_flags); /** - * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs * @cpu: CPU to dispatch tasks for * @prev: previous task being switched out * @@ -287,7 +328,7 @@ struct sched_ext_ops { void (*dispatch)(s32 cpu, struct task_struct *prev); /** - * tick - Periodic tick + * @tick: Periodic tick * @p: task running currently * * This operation is called every 1/HZ seconds on CPUs which are @@ -297,7 +338,7 @@ struct sched_ext_ops { void (*tick)(struct task_struct *p); /** - * runnable - A task is becoming runnable on its associated CPU + * @runnable: A task is becoming runnable on its associated CPU * @p: task becoming runnable * @enq_flags: %SCX_ENQ_* * @@ -324,7 +365,7 @@ struct sched_ext_ops { void (*runnable)(struct task_struct *p, u64 enq_flags); /** - * running - A task is starting to run on its associated CPU + * @running: A task is starting to run on its associated CPU * @p: task starting to run * * See ->runnable() for explanation on the task state notifiers. @@ -332,7 +373,7 @@ struct sched_ext_ops { void (*running)(struct task_struct *p); /** - * stopping - A task is stopping execution + * @stopping: A task is stopping execution * @p: task stopping to run * @runnable: is task @p still runnable? 
* @@ -343,7 +384,7 @@ struct sched_ext_ops { void (*stopping)(struct task_struct *p, bool runnable); /** - * quiescent - A task is becoming not runnable on its associated CPU + * @quiescent: A task is becoming not runnable on its associated CPU * @p: task becoming not runnable * @deq_flags: %SCX_DEQ_* * @@ -363,7 +404,7 @@ struct sched_ext_ops { void (*quiescent)(struct task_struct *p, u64 deq_flags); /** - * yield - Yield CPU + * @yield: Yield CPU * @from: yielding task * @to: optional yield target task * @@ -378,7 +419,7 @@ struct sched_ext_ops { bool (*yield)(struct task_struct *from, struct task_struct *to); /** - * core_sched_before - Task ordering for core-sched + * @core_sched_before: Task ordering for core-sched * @a: task A * @b: task B * @@ -396,7 +437,7 @@ struct sched_ext_ops { bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); /** - * set_weight - Set task weight + * @set_weight: Set task weight * @p: task to set weight for * @weight: new weight [1..10000] * @@ -405,7 +446,7 @@ struct sched_ext_ops { void (*set_weight)(struct task_struct *p, u32 weight); /** - * set_cpumask - Set CPU affinity + * @set_cpumask: Set CPU affinity * @p: task to set CPU affinity for * @cpumask: cpumask of cpus that @p can run on * @@ -415,8 +456,8 @@ struct sched_ext_ops { const struct cpumask *cpumask); /** - * update_idle - Update the idle state of a CPU - * @cpu: CPU to udpate the idle state for + * @update_idle: Update the idle state of a CPU + * @cpu: CPU to update the idle state for * @idle: whether entering or exiting the idle state * * This operation is called when @rq's CPU goes or leaves the idle @@ -436,7 +477,7 @@ struct sched_ext_ops { void (*update_idle)(s32 cpu, bool idle); /** - * cpu_acquire - A CPU is becoming available to the BPF scheduler + * @cpu_acquire: A CPU is becoming available to the BPF scheduler * @cpu: The CPU being acquired by the BPF scheduler. * @args: Acquire arguments, see the struct definition. * @@ -446,7 +487,7 @@ struct sched_ext_ops { void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); /** - * cpu_release - A CPU is taken away from the BPF scheduler + * @cpu_release: A CPU is taken away from the BPF scheduler * @cpu: The CPU being released by the BPF scheduler. * @args: Release arguments, see the struct definition. * @@ -458,7 +499,7 @@ struct sched_ext_ops { void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); /** - * init_task - Initialize a task to run in a BPF scheduler + * @init_task: Initialize a task to run in a BPF scheduler * @p: task to initialize for BPF scheduling * @args: init arguments, see the struct definition * @@ -473,8 +514,9 @@ struct sched_ext_ops { s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); /** - * exit_task - Exit a previously-running task from the system + * @exit_task: Exit a previously-running task from the system * @p: task to exit + * @args: exit arguments, see the struct definition * * @p is exiting or the BPF scheduler is being unloaded. Perform any * necessary cleanup for @p. @@ -482,7 +524,7 @@ struct sched_ext_ops { void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); /** - * enable - Enable BPF scheduling for a task + * @enable: Enable BPF scheduling for a task * @p: task to enable BPF scheduling for * * Enable @p for BPF scheduling. 
enable() is called on @p any time it @@ -491,7 +533,7 @@ struct sched_ext_ops { void (*enable)(struct task_struct *p); /** - * disable - Disable BPF scheduling for a task + * @disable: Disable BPF scheduling for a task * @p: task to disable BPF scheduling for * * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. @@ -501,7 +543,7 @@ struct sched_ext_ops { void (*disable)(struct task_struct *p); /** - * dump - Dump BPF scheduler state on error + * @dump: Dump BPF scheduler state on error * @ctx: debug dump context * * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. @@ -509,7 +551,7 @@ struct sched_ext_ops { void (*dump)(struct scx_dump_ctx *ctx); /** - * dump_cpu - Dump BPF scheduler state for a CPU on error + * @dump_cpu: Dump BPF scheduler state for a CPU on error * @ctx: debug dump context * @cpu: CPU to generate debug dump for * @idle: @cpu is currently idle without any runnable tasks @@ -521,7 +563,7 @@ struct sched_ext_ops { void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); /** - * dump_task - Dump BPF scheduler state for a runnable task on error + * @dump_task: Dump BPF scheduler state for a runnable task on error * @ctx: debug dump context * @p: runnable task to generate debug dump for * @@ -532,7 +574,7 @@ struct sched_ext_ops { #ifdef CONFIG_EXT_GROUP_SCHED /** - * cgroup_init - Initialize a cgroup + * @cgroup_init: Initialize a cgroup * @cgrp: cgroup being initialized * @args: init arguments, see the struct definition * @@ -547,7 +589,7 @@ struct sched_ext_ops { struct scx_cgroup_init_args *args); /** - * cgroup_exit - Exit a cgroup + * @cgroup_exit: Exit a cgroup * @cgrp: cgroup being exited * * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit @@ -556,7 +598,7 @@ struct sched_ext_ops { void (*cgroup_exit)(struct cgroup *cgrp); /** - * cgroup_prep_move - Prepare a task to be moved to a different cgroup + * @cgroup_prep_move: Prepare a task to be moved to a different cgroup * @p: task being moved * @from: cgroup @p is being moved from * @to: cgroup @p is being moved to @@ -571,7 +613,7 @@ struct sched_ext_ops { struct cgroup *from, struct cgroup *to); /** - * cgroup_move - Commit cgroup move + * @cgroup_move: Commit cgroup move * @p: task being moved * @from: cgroup @p is being moved from * @to: cgroup @p is being moved to @@ -582,7 +624,7 @@ struct sched_ext_ops { struct cgroup *from, struct cgroup *to); /** - * cgroup_cancel_move - Cancel cgroup move + * @cgroup_cancel_move: Cancel cgroup move * @p: task whose cgroup move is being canceled * @from: cgroup @p was being moved from * @to: cgroup @p was being moved to @@ -594,7 +636,7 @@ struct sched_ext_ops { struct cgroup *from, struct cgroup *to); /** - * cgroup_set_weight - A cgroup's weight is being changed + * @cgroup_set_weight: A cgroup's weight is being changed * @cgrp: cgroup whose weight is being updated * @weight: new weight [1..10000] * @@ -608,7 +650,7 @@ struct sched_ext_ops { */ /** - * cpu_online - A CPU became online + * @cpu_online: A CPU became online * @cpu: CPU which just came up * * @cpu just came online. @cpu will not call ops.enqueue() or @@ -617,7 +659,7 @@ struct sched_ext_ops { void (*cpu_online)(s32 cpu); /** - * cpu_offline - A CPU is going offline + * @cpu_offline: A CPU is going offline * @cpu: CPU which is going offline * * @cpu is going offline. 
@cpu will not call ops.enqueue() or @@ -630,12 +672,12 @@ struct sched_ext_ops { */ /** - * init - Initialize the BPF scheduler + * @init: Initialize the BPF scheduler */ s32 (*init)(void); /** - * exit - Clean up after the BPF scheduler + * @exit: Clean up after the BPF scheduler * @info: Exit info * * ops.exit() is also called on ops.init() failure, which is a bit @@ -645,17 +687,17 @@ struct sched_ext_ops { void (*exit)(struct scx_exit_info *info); /** - * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch + * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch */ u32 dispatch_max_batch; /** - * flags - %SCX_OPS_* flags + * @flags: %SCX_OPS_* flags */ u64 flags; /** - * timeout_ms - The maximum amount of time, in milliseconds, that a + * @timeout_ms: The maximum amount of time, in milliseconds, that a * runnable task should be able to wait before being scheduled. The * maximum timeout may not exceed the default timeout of 30 seconds. * @@ -664,13 +706,13 @@ struct sched_ext_ops { u32 timeout_ms; /** - * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default + * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default * value of 32768 is used. */ u32 exit_dump_len; /** - * hotplug_seq - A sequence number that may be set by the scheduler to + * @hotplug_seq: A sequence number that may be set by the scheduler to * detect when a hotplug event has occurred during the loading process. * If 0, no detection occurs. Otherwise, the scheduler will fail to * load if the sequence number does not match @scx_hotplug_seq on the @@ -679,7 +721,7 @@ struct sched_ext_ops { u64 hotplug_seq; /** - * name - BPF scheduler's name + * @name: BPF scheduler's name * * Must be a non-zero valid BPF object name including only isalnum(), * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the @@ -764,6 +806,7 @@ enum scx_deq_flags { enum scx_pick_idle_cpu_flags { SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ }; enum scx_kick_flags { @@ -879,15 +922,11 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all); static struct sched_ext_ops scx_ops; static bool scx_warned_zero_slice; +DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); +static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); - -#ifdef CONFIG_SMP -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); -static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); -#endif static struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... 
SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; @@ -922,21 +961,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; -/* idle tracking */ -#ifdef CONFIG_SMP -#ifdef CONFIG_CPUMASK_OFFSTACK -#define CL_ALIGNED_IF_ONSTACK -#else -#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -#endif - -static struct { - cpumask_var_t cpu; - cpumask_var_t smt; -} idle_masks CL_ALIGNED_IF_ONSTACK; - -#endif /* CONFIG_SMP */ - /* for %SCX_KICK_WAIT */ static unsigned long __percpu *scx_kick_cpus_pnt_seqs; @@ -960,7 +984,7 @@ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); static struct scx_dispatch_q **global_dsqs; static const struct rhashtable_params dsq_hash_params = { - .key_len = 8, + .key_len = sizeof_field(struct scx_dispatch_q, id), .key_offset = offsetof(struct scx_dispatch_q, id), .head_offset = offsetof(struct scx_dispatch_q, hash_node), }; @@ -1213,7 +1237,7 @@ static bool scx_kf_allowed_if_unlocked(void) /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being interated + * @dsq: user dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -1408,7 +1432,6 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) /** * scx_task_iter_next_locked - Next non-idle task with its rq locked * @iter: iterator to walk - * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify * whether they would like to filter out dead tasks. See scx_task_iter_start() @@ -1458,6 +1481,117 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return p; } +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * The total number of tasks enqueued (or pick_task-ed) with a + * default time slice (SCX_SLICE_DFL). + */ + s64 SCX_EV_ENQ_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +/* + * The event counter is organized by a per-CPU variable to minimize the + * accounting overhead without synchronization. A system-wide view on the + * event counter is constructed when requested by scx_bpf_get_event_stat(). 
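/*
 * Standalone sketch (not kernel code) of the counter scheme introduced below:
 * each CPU bumps only its own copy on the hot path and a system-wide view is
 * built by summing the per-CPU copies on demand, mirroring what
 * __scx_add_event()/scx_agg_event() do with the DEFINE_PER_CPU data.  NCPU
 * and the field names are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define NCPU 4

struct event_stats {
	int64_t select_cpu_fallback;
	int64_t enq_slice_dfl;
};

static struct event_stats per_cpu_stats[NCPU];	/* one block per CPU, no locking */

static void aggregate(struct event_stats *out)
{
	for (int cpu = 0; cpu < NCPU; cpu++) {
		out->select_cpu_fallback += per_cpu_stats[cpu].select_cpu_fallback;
		out->enq_slice_dfl       += per_cpu_stats[cpu].enq_slice_dfl;
	}
}

int main(void)
{
	struct event_stats total = { 0 };

	per_cpu_stats[1].enq_slice_dfl += 3;	/* "CPU 1" hot path */
	per_cpu_stats[2].enq_slice_dfl += 2;	/* "CPU 2" hot path */
	aggregate(&total);
	printf("enq_slice_dfl = %lld\n", (long long)total.enq_slice_dfl);
	return 0;
}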
+ */ +static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu); + +/** + * scx_add_event - Increase an event counter for 'name' by 'cnt' + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This can be used when preemption is not disabled. + */ +#define scx_add_event(name, cnt) do { \ + this_cpu_add(event_stats_cpu.name, cnt); \ + trace_sched_ext_event(#name, cnt); \ +} while(0) + +/** + * __scx_add_event - Increase an event counter for 'name' by 'cnt' + * @name: an event name defined in struct scx_event_stats + * @cnt: the number of the event occured + * + * This should be used only when preemption is disabled. + */ +#define __scx_add_event(name, cnt) do { \ + __this_cpu_add(event_stats_cpu.name, cnt); \ + trace_sched_ext_event(#name, cnt); \ +} while(0) + +/** + * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e' + * @dst_e: destination event stats + * @src_e: source event stats + * @kind: a kind of event to be aggregated + */ +#define scx_agg_event(dst_e, src_e, kind) do { \ + (dst_e)->kind += READ_ONCE((src_e)->kind); \ +} while(0) + +/** + * scx_dump_event - Dump an event 'kind' in 'events' to 's' + * @s: output seq_buf + * @events: event stats + * @kind: a kind of event to dump + */ +#define scx_dump_event(s, events, kind) do { \ + dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \ +} while (0) + + +static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz); + static enum scx_ops_enable_state scx_ops_enable_state(void) { return atomic_read(&scx_ops_enable_state_var); @@ -2003,16 +2137,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_rq_bypassing(rq)) + if (scx_rq_bypassing(rq)) { + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); goto global; + } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; /* see %SCX_OPS_ENQ_EXITING */ if (!static_branch_unlikely(&scx_ops_enq_exiting) && - unlikely(p->flags & PF_EXITING)) + unlikely(p->flags & PF_EXITING)) { + __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1); + goto local; + } + + /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */ + if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) && + is_migration_disabled(p)) { + __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1); goto local; + } if (!SCX_HAS_OP(enqueue)) goto global; @@ -2052,6 +2197,7 @@ local: */ touch_core_sched(rq, p); p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); local_norefill: dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); return; @@ -2059,6 +2205,7 @@ local_norefill: global: touch_core_sched(rq, p); /* see the comment in local: */ p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); dispatch_enqueue(find_global_dsq(p), p, enq_flags); } @@ -2078,7 +2225,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p) /* * list_add_tail() must be used. scx_ops_bypass() depends on tasks being - * appened to the runnable_list. + * appended to the runnable_list. 
*/ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); } @@ -2130,6 +2277,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags do_enqueue_task(rq, p, enq_flags, sticky_cpu); out: rq->scx.flags &= ~SCX_RQ_IN_WAKEUP; + + if ((enq_flags & SCX_ENQ_CPU_SELECTED) && + unlikely(cpu_of(rq) != p->scx.selected_cpu)) + __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1); } static void ops_dequeue(struct task_struct *p, u64 deq_flags) @@ -2313,12 +2464,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, * * - The BPF scheduler is bypassed while the rq is offline and we can always say * no to the BPF scheduler initiated migrations while offline. + * + * The caller must ensure that @p and @rq are on different CPUs. */ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, - bool trigger_error) + bool enforce) { int cpu = cpu_of(rq); + WARN_ON_ONCE(task_cpu(p) == cpu); + + /* + * If @p has migration disabled, @p->cpus_ptr is updated to contain only + * the pinned CPU in migrate_disable_switch() while @p is being switched + * out. However, put_prev_task_scx() is called before @p->cpus_ptr is + * updated and thus another CPU may see @p on a DSQ inbetween leading to + * @p passing the below task_allowed_on_cpu() check while migration is + * disabled. + * + * Test the migration disabled state first as the race window is narrow + * and the BPF scheduler failing to check migration disabled state can + * easily be masked if task_allowed_on_cpu() is done first. + */ + if (unlikely(is_migration_disabled(p))) { + if (enforce) + scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d", + p->comm, p->pid, task_cpu(p), cpu); + return false; + } + /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline @@ -2326,17 +2500,17 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, * picked CPU is outside the allowed mask. 
*/ if (!task_allowed_on_cpu(p, cpu)) { - if (trigger_error) - scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", - cpu_of(rq), p->comm, p->pid); + if (enforce) + scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]", + cpu, p->comm, p->pid); return false; } - if (unlikely(is_migration_disabled(p))) - return false; - - if (!scx_rq_online(rq)) + if (!scx_rq_online(rq)) { + if (enforce) + __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; + } return true; } @@ -2406,7 +2580,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, } #else /* CONFIG_SMP */ static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } +static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; } static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ @@ -2437,7 +2611,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, if (dst_dsq->id == SCX_DSQ_LOCAL) { dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); - if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dst_dsq = find_global_dsq(p); dst_rq = src_rq; } @@ -2480,7 +2655,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, /* * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artifical delays while the + * to the bypass mode can take a long time. Inject artificial delays while the * bypass mode is switching to guarantee timely completion. */ static void scx_ops_breather(struct rq *rq) @@ -2575,6 +2750,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, { struct rq *src_rq = task_rq(p); struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); +#ifdef CONFIG_SMP + struct rq *locked_rq = rq; +#endif /* * We're synchronized against dequeue through DISPATCHING. 
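/*
 * Toy pthread sketch (not kernel code) of the locked_rq bookkeeping added to
 * dispatch_to_local_dsq() below: when a function may drop the lock it was
 * entered with and take others, it must remember which lock it currently
 * holds and release that one, rather than assuming the original is still
 * held.  The lock names and the "move" step are made up for illustration.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rq_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_b = PTHREAD_MUTEX_INITIALIZER;

/* called with @entry held; returns with @entry held again */
static void move_task(pthread_mutex_t *entry, pthread_mutex_t *src, pthread_mutex_t *dst)
{
	pthread_mutex_t *locked = entry;	/* track what we actually hold */

	if (locked != src) {			/* switch to the source lock */
		pthread_mutex_unlock(locked);
		locked = src;
		pthread_mutex_lock(locked);
	}

	if (src != dst) {			/* the "move" leaves @dst locked */
		pthread_mutex_unlock(locked);
		locked = dst;
		pthread_mutex_lock(locked);
	}

	if (locked != entry) {			/* restore the caller's lock */
		pthread_mutex_unlock(locked);
		pthread_mutex_lock(entry);
	}
}

int main(void)
{
	pthread_mutex_lock(&rq_a);
	move_task(&rq_a, &rq_a, &rq_b);
	pthread_mutex_unlock(&rq_a);
	puts("done");
	return 0;
}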
As @p can't @@ -2588,7 +2766,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } #ifdef CONFIG_SMP - if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { + if (src_rq != dst_rq && + unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { dispatch_enqueue(find_global_dsq(p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; @@ -2611,8 +2790,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); /* switch to @src_rq lock */ - if (rq != src_rq) { - raw_spin_rq_unlock(rq); + if (locked_rq != src_rq) { + raw_spin_rq_unlock(locked_rq); + locked_rq = src_rq; raw_spin_rq_lock(src_rq); } @@ -2630,6 +2810,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } else { move_remote_task_to_local_dsq(p, enq_flags, src_rq, dst_rq); + /* task has been moved to dst_rq, which is now locked */ + locked_rq = dst_rq; } /* if the destination CPU is idle, wake it up */ @@ -2638,8 +2820,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, } /* switch back to @rq lock */ - if (rq != dst_rq) { - raw_spin_rq_unlock(dst_rq); + if (locked_rq != rq) { + raw_spin_rq_unlock(locked_rq); raw_spin_rq_lock(rq); } #else /* CONFIG_SMP */ @@ -2845,6 +3027,7 @@ no_tasks: if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; + __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; } rq->scx.flags &= ~SCX_RQ_IN_BALANCE; @@ -3069,7 +3252,6 @@ static struct task_struct *pick_task_scx(struct rq *rq) { struct task_struct *prev = rq->curr; struct task_struct *p; - bool prev_on_scx = prev->sched_class == &ext_sched_class; bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; bool kick_idle = false; @@ -3089,14 +3271,18 @@ static struct task_struct *pick_task_scx(struct rq *rq) * if pick_task_scx() is called without preceding balance_scx(). */ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev_on_scx) { + if (prev->scx.flags & SCX_TASK_QUEUED) { keep_prev = true; } else { keep_prev = false; kick_idle = true; } - } else if (unlikely(keep_prev && !prev_on_scx)) { - /* only allowed during transitions */ + } else if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { + /* + * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is + * conditional on scx_enabled() and may have been skipped. + */ WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED); keep_prev = false; } @@ -3108,8 +3294,10 @@ static struct task_struct *pick_task_scx(struct rq *rq) */ if (keep_prev) { p = prev; - if (!p->scx.slice) + if (!p->scx.slice) { p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + } } else { p = first_local_task(rq); if (!p) { @@ -3125,6 +3313,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) scx_warned_zero_slice = true; } p->scx.slice = SCX_SLICE_DFL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); } } @@ -3136,6 +3325,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * scx_prio_less - Task ordering for core-sched * @a: task A * @b: task B + * @in_fi: in forced idle state * * Core-sched is implemented as an additional scheduling layer on top of the * usual sched_class'es and needs to find out the expected task ordering. 
For @@ -3143,7 +3333,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used * to implement the default task ordering. The older the timestamp, the higher - * prority the task - the global FIFO ordering matching the default scheduling + * priority the task - the global FIFO ordering matching the default scheduling * behavior. * * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to @@ -3168,354 +3358,10 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, #ifdef CONFIG_SMP -static bool test_and_clear_cpu_idle(int cpu) -{ -#ifdef CONFIG_SCHED_SMT - /* - * SMT mask should be cleared whether we can claim @cpu or not. The SMT - * cluster is not wholly idle either way. This also prevents - * scx_pick_idle_cpu() from getting caught in an infinite loop. - */ - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - /* - * If offline, @cpu is not its own sibling and - * scx_pick_idle_cpu() can get caught in an infinite loop as - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu - * is eventually cleared. - */ - if (cpumask_intersects(smt, idle_masks.smt)) - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - else if (cpumask_test_cpu(cpu, idle_masks.smt)) - __cpumask_clear_cpu(cpu, idle_masks.smt); - } -#endif - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -} - -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -{ - int cpu; - -retry: - if (sched_smt_active()) { - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); - if (cpu < nr_cpu_ids) - goto found; - - if (flags & SCX_PICK_IDLE_CORE) - return -EBUSY; - } - - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); - if (cpu >= nr_cpu_ids) - return -EBUSY; - -found: - if (test_and_clear_cpu_idle(cpu)) - return cpu; - else - goto retry; -} - -/* - * Return true if the LLC domains do not perfectly overlap with the NUMA - * domains, false otherwise. - */ -static bool llc_numa_mismatch(void) -{ - int cpu; - - /* - * We need to scan all online CPUs to verify whether their scheduling - * domains overlap. - * - * While it is rare to encounter architectures with asymmetric NUMA - * topologies, CPU hotplugging or virtualized environments can result - * in asymmetric configurations. - * - * For example: - * - * NUMA 0: - * - LLC 0: cpu0..cpu7 - * - LLC 1: cpu8..cpu15 [offline] - * - * NUMA 1: - * - LLC 0: cpu16..cpu23 - * - LLC 1: cpu24..cpu31 - * - * In this case, if we only check the first online CPU (cpu0), we might - * incorrectly assume that the LLC and NUMA domains are fully - * overlapping, which is incorrect (as NUMA 1 has two distinct LLC - * domains). - */ - for_each_online_cpu(cpu) { - const struct cpumask *numa_cpus; - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) - return true; - - numa_cpus = cpumask_of_node(cpu_to_node(cpu)); - if (sd->span_weight != cpumask_weight(numa_cpus)) - return true; - } - - return false; -} - -/* - * Initialize topology-aware scheduling. - * - * Detect if the system has multiple LLC or multiple NUMA domains and enable - * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle - * selection policy. - * - * Assumption: the kernel's internal topology representation assumes that each - * CPU belongs to a single LLC domain, and that each LLC domain is entirely - * contained within a single NUMA node. 
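/*
 * Standalone sketch (not kernel code) of the llc_numa_mismatch() test shown
 * above: for every online CPU, compare the size of its LLC domain with the
 * size of its NUMA node; any difference means the two topologies do not
 * overlap perfectly.  The cpu_llc[]/cpu_node[] maps are made-up input.
 */
#include <stdbool.h>
#include <stdio.h>

#define NCPU 8

/* CPUs 0-3 share one LLC, CPUs 4-7 another; all sit in a single NUMA node */
static const int cpu_llc[NCPU]  = { 0, 0, 0, 0, 1, 1, 1, 1 };
static const int cpu_node[NCPU] = { 0, 0, 0, 0, 0, 0, 0, 0 };

static int domain_size(const int *map, int id)
{
	int cpu, n = 0;

	for (cpu = 0; cpu < NCPU; cpu++)
		if (map[cpu] == id)
			n++;
	return n;
}

static bool llc_numa_mismatch(void)
{
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++)
		if (domain_size(cpu_llc, cpu_llc[cpu]) !=
		    domain_size(cpu_node, cpu_node[cpu]))
			return true;
	return false;
}

int main(void)
{
	printf("mismatch = %d\n", llc_numa_mismatch());
	return 0;
}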
- */ -static void update_selcpu_topology(void) -{ - bool enable_llc = false, enable_numa = false; - struct sched_domain *sd; - const struct cpumask *cpus; - s32 cpu = cpumask_first(cpu_online_mask); - - /* - * Enable LLC domain optimization only when there are multiple LLC - * domains among the online CPUs. If all online CPUs are part of a - * single LLC domain, the idle CPU selection logic can choose any - * online CPU without bias. - * - * Note that it is sufficient to check the LLC domain of the first - * online CPU to determine whether a single LLC domain includes all - * CPUs. - */ - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (sd) { - if (sd->span_weight < num_online_cpus()) - enable_llc = true; - } - - /* - * Enable NUMA optimization only when there are multiple NUMA domains - * among the online CPUs and the NUMA domains don't perfectly overlaps - * with the LLC domains. - * - * If all CPUs belong to the same NUMA node and the same LLC domain, - * enabling both NUMA and LLC optimizations is unnecessary, as checking - * for an idle CPU in the same domain twice is redundant. - */ - cpus = cpumask_of_node(cpu_to_node(cpu)); - if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch()) - enable_numa = true; - rcu_read_unlock(); - - pr_debug("sched_ext: LLC idle selection %s\n", - enable_llc ? "enabled" : "disabled"); - pr_debug("sched_ext: NUMA idle selection %s\n", - enable_numa ? "enabled" : "disabled"); - - if (enable_llc) - static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); - if (enable_numa) - static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); - else - static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); -} - -/* - * Built-in CPU idle selection policy: - * - * 1. Prioritize full-idle cores: - * - always prioritize CPUs from fully idle cores (both logical CPUs are - * idle) to avoid interference caused by SMT. - * - * 2. Reuse the same CPU: - * - prefer the last used CPU to take advantage of cached data (L1, L2) and - * branch prediction optimizations. - * - * 3. Pick a CPU within the same LLC (Last-Level Cache): - * - if the above conditions aren't met, pick a CPU that shares the same LLC - * to maintain cache locality. - * - * 4. Pick a CPU within the same NUMA node, if enabled: - * - choose a CPU from the same NUMA node to reduce memory access latency. - * - * Step 3 and 4 are performed only if the system has, respectively, multiple - * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and - * scx_selcpu_topo_numa). - * - * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because - * we never call ops.select_cpu() for them, see select_task_rq(). - */ -static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *found) -{ - const struct cpumask *llc_cpus = NULL; - const struct cpumask *numa_cpus = NULL; - s32 cpu; - - *found = false; - - - /* - * This is necessary to protect llc_cpus. - */ - rcu_read_lock(); - - /* - * Determine the scheduling domain only if the task is allowed to run - * on all CPUs. - * - * This is done primarily for efficiency, as it avoids the overhead of - * updating a cpumask every time we need to select an idle CPU (which - * can be costly in large SMP systems), but it also aligns logically: - * if a task's scheduling domain is restricted by user-space (through - * CPU affinity), the task will simply use the flat scheduling domain - * defined by user-space. 
- */ - if (p->nr_cpus_allowed >= num_possible_cpus()) { - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) - numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu)); - - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, prev_cpu)); - if (sd) - llc_cpus = sched_domain_span(sd); - } - } - - /* - * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. - */ - if (wake_flags & SCX_WAKE_SYNC) { - cpu = smp_processor_id(); - - /* - * If the waker's CPU is cache affine and prev_cpu is idle, - * then avoid a migration. - */ - if (cpus_share_cache(cpu, prev_cpu) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * If the waker's local DSQ is empty, and the system is under - * utilized, try to wake up @p to the local DSQ of the waker. - * - * Checking only for an empty local DSQ is insufficient as it - * could give the wakee an unfair advantage when the system is - * oversaturated. - * - * Checking only for the presence of idle CPUs is also - * insufficient as the local DSQ of the waker could have tasks - * piled up on it even if there is an idle core elsewhere on - * the system. - */ - if (!cpumask_empty(idle_masks.cpu) && - !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) - goto cpu_found; - } - } - - /* - * If CPU has SMT, any wholly idle CPU is likely a better pick than - * partially idle @prev_cpu. - */ - if (sched_smt_active()) { - /* - * Keep using @prev_cpu if it's part of a fully idle core. - */ - if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && - test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any fully idle core in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any fully idle core in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any full idle core usable by the task. - */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Use @prev_cpu if it's idle. - */ - if (test_and_clear_cpu_idle(prev_cpu)) { - cpu = prev_cpu; - goto cpu_found; - } - - /* - * Search for any idle CPU in the same LLC domain. - */ - if (llc_cpus) { - cpu = scx_pick_idle_cpu(llc_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU in the same NUMA node. - */ - if (numa_cpus) { - cpu = scx_pick_idle_cpu(numa_cpus, 0); - if (cpu >= 0) - goto cpu_found; - } - - /* - * Search for any idle CPU usable by the task. 
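/*
 * Condensed, standalone sketch (not kernel code) of the default idle-CPU
 * cascade described above, which now lives in kernel/sched/ext_idle.c:
 * prefer a fully idle SMT core, then the previous CPU, then anything idle in
 * the same LLC, then anything idle at all.  The NUMA step is omitted and the
 * topology/idle state is toy data; the real code works on cpumasks under RCU.
 */
#include <stdio.h>

#define NCPU 8

static int idle[NCPU]    = { 0, 1, 0, 1, 1, 1, 0, 0 };
static int sibling[NCPU] = { 1, 0, 3, 2, 5, 4, 7, 6 };	/* SMT pairs */
static int llc[NCPU]     = { 0, 0, 0, 0, 1, 1, 1, 1 };

static int pick(int want_whole_core, int same_llc_as)
{
	for (int cpu = 0; cpu < NCPU; cpu++) {
		if (!idle[cpu])
			continue;
		if (want_whole_core && !idle[sibling[cpu]])
			continue;
		if (same_llc_as >= 0 && llc[cpu] != llc[same_llc_as])
			continue;
		return cpu;
	}
	return -1;
}

static int select_cpu(int prev)
{
	int cpu;

	if (idle[prev] && idle[sibling[prev]])
		return prev;			/* 1. fully idle core at prev */
	if ((cpu = pick(1, prev)) >= 0)
		return cpu;			/*    fully idle core in the LLC */
	if ((cpu = pick(1, -1)) >= 0)
		return cpu;			/*    fully idle core anywhere */
	if (idle[prev])
		return prev;			/* 2. prev CPU itself */
	if ((cpu = pick(0, prev)) >= 0)
		return cpu;			/* 3. any idle CPU in the LLC */
	return pick(0, -1);			/* 4. any idle CPU at all */
}

int main(void)
{
	printf("picked CPU %d for prev_cpu 0\n", select_cpu(0));
	return 0;
}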
- */ - cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); - if (cpu >= 0) - goto cpu_found; - - rcu_read_unlock(); - return prev_cpu; - -cpu_found: - rcu_read_unlock(); - - *found = true; - return cpu; -} - static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { + bool rq_bypass; + /* * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it * can be a good migration opportunity with low cache and memory @@ -3529,7 +3375,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) { + rq_bypass = scx_rq_bypassing(task_rq(p)); + if (SCX_HAS_OP(select_cpu) && !rq_bypass) { s32 cpu; struct task_struct **ddsp_taskp; @@ -3539,20 +3386,27 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, select_cpu, p, prev_cpu, wake_flags); + p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; if (ops_cpu_valid(cpu, "from ops.select_cpu()")) return cpu; else return prev_cpu; } else { - bool found; s32 cpu; - cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); - if (found) { + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + if (cpu >= 0) { p->scx.slice = SCX_SLICE_DFL; p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; + __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1); + } else { + cpu = prev_cpu; } + p->scx.selected_cpu = cpu; + + if (rq_bypass) + __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1); return cpu; } } @@ -3580,95 +3434,6 @@ static void set_cpus_allowed_scx(struct task_struct *p, (struct cpumask *)p->cpus_ptr); } -static void reset_idle_masks(void) -{ - /* - * Consider all online cpus idle. Should converge to the actual state - * quickly. - */ - cpumask_copy(idle_masks.cpu, cpu_online_mask); - cpumask_copy(idle_masks.smt, cpu_online_mask); -} - -static void update_builtin_idle(int cpu, bool idle) -{ - if (idle) - cpumask_set_cpu(cpu, idle_masks.cpu); - else - cpumask_clear_cpu(cpu, idle_masks.cpu); - -#ifdef CONFIG_SCHED_SMT - if (sched_smt_active()) { - const struct cpumask *smt = cpu_smt_mask(cpu); - - if (idle) { - /* - * idle_masks.smt handling is racy but that's fine as - * it's only for optimization and self-correcting. - */ - for_each_cpu(cpu, smt) { - if (!cpumask_test_cpu(cpu, idle_masks.cpu)) - return; - } - cpumask_or(idle_masks.smt, idle_masks.smt, smt); - } else { - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); - } - } -#endif -} - -/* - * Update the idle state of a CPU to @idle. - * - * If @do_notify is true, ops.update_idle() is invoked to notify the scx - * scheduler of an actual idle state transition (idle to busy or vice - * versa). If @do_notify is false, only the idle state in the idle masks is - * refreshed without invoking ops.update_idle(). - * - * This distinction is necessary, because an idle CPU can be "reserved" and - * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as - * busy even if no tasks are dispatched. In this case, the CPU may return - * to idle without a true state transition. Refreshing the idle masks - * without invoking ops.update_idle() ensures accurate idle state tracking - * while avoiding unnecessary updates and maintaining balanced state - * transitions. 
- */ -void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) -{ - int cpu = cpu_of(rq); - - lockdep_assert_rq_held(rq); - - /* - * Trigger ops.update_idle() only when transitioning from a task to - * the idle thread and vice versa. - * - * Idle transitions are indicated by do_notify being set to true, - * managed by put_prev_task_idle()/set_next_task_idle(). - */ - if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - - /* - * Update the idle masks: - * - for real idle transitions (do_notify == true) - * - for idle-to-idle transitions (indicated by the previous task - * being the idle thread, managed by pick_task_idle()) - * - * Skip updating idle masks if the previous task is not the idle - * thread, since set_next_task_idle() has already handled it when - * transitioning from a task to the idle thread (calling this - * function with do_notify == true). - * - * In this way we can avoid updating the idle masks twice, - * unnecessarily. - */ - if (static_branch_likely(&scx_builtin_idle_enabled)) - if (do_notify || is_idle_task(rq->curr)) - update_builtin_idle(cpu, idle); -} - static void handle_hotplug(struct rq *rq, bool online) { int cpu = cpu_of(rq); @@ -3676,7 +3441,7 @@ static void handle_hotplug(struct rq *rq, bool online) atomic_long_inc(&scx_hotplug_seq); if (scx_enabled()) - update_selcpu_topology(); + scx_idle_update_selcpu_topology(&scx_ops); if (online && SCX_HAS_OP(cpu_online)) SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); @@ -3708,12 +3473,6 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } -#else /* CONFIG_SMP */ - -static bool test_and_clear_cpu_idle(int cpu) { return false; } -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -static void reset_idle_masks(void) {} - #endif /* CONFIG_SMP */ static bool check_rq_for_timeouts(struct rq *rq) @@ -3791,7 +3550,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(tick)) { - SCX_CALL_OP(SCX_KF_REST, tick, curr); + SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr); } if (!curr->scx.slice) @@ -3938,7 +3697,7 @@ static void scx_ops_disable_task(struct task_struct *p) WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); if (SCX_HAS_OP(disable)) - SCX_CALL_OP(SCX_KF_REST, disable, p); + SCX_CALL_OP_TASK(SCX_KF_REST, disable, p); scx_set_task_state(p, SCX_TASK_READY); } @@ -3967,7 +3726,7 @@ static void scx_ops_exit_task(struct task_struct *p) } if (SCX_HAS_OP(exit_task)) - SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); + SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args); scx_set_task_state(p, SCX_TASK_NONE); } @@ -4263,25 +4022,12 @@ err: return ops_sanitize_err("cgroup_prep_move", ret); } -void scx_move_task(struct task_struct *p) +void scx_cgroup_move_task(struct task_struct *p) { if (!scx_cgroup_enabled) return; /* - * We're called from sched_move_task() which handles both cgroup and - * autogroup moves. Ignore the latter. - * - * Also ignore exiting tasks, because in the exit path tasks transition - * from the autogroup to the root group, so task_group_is_autogroup() - * alone isn't able to catch exiting autogroup tasks. This is safe for - * cgroup_move(), because cgroup migrations never happen for PF_EXITING - * tasks. 
- */ - if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) - return; - - /* * @p must have ops.cgroup_prep_move() called on it and thus * cgrp_moving_from set. */ @@ -4530,7 +4276,7 @@ static int scx_cgroup_init(void) cgroup_warned_missing_idle = false; /* - * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. */ rcu_read_lock(); @@ -4651,8 +4397,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj, } SCX_ATTR(ops); +#define scx_attr_event_show(buf, at, events, kind) ({ \ + sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \ +}) + +static ssize_t scx_attr_events_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + struct scx_event_stats events; + int at = 0; + + scx_bpf_events(&events, sizeof(events)); + at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); + at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); + return at; +} +SCX_ATTR(events); + static struct attribute *scx_sched_attrs[] = { &scx_attr_ops.attr, + &scx_attr_events.attr, NULL, }; ATTRIBUTE_GROUPS(scx_sched); @@ -4688,6 +4459,7 @@ bool task_should_scx(int policy) /** * scx_softlockup - sched_ext softlockup handler + * @dur_s: number of seconds of CPU stuck due to soft lockup * * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can * live-lock the system by making many CPUs target the same DSQ to the point @@ -4731,6 +4503,7 @@ static void scx_clear_softlockup(void) /** * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress + * @bypass: true for bypass, false for unbypass * * Bypassing guarantees that all runnable tasks make forward progress without * trusting the BPF scheduler. 
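/*
 * Toy sketch (not kernel code) of the depth-counted bypass timing added to
 * scx_ops_bypass() below: the timestamp is taken only on the 0->1 transition
 * and the duration accumulated only on the 1->0 transition, so nested bypass
 * requests are not double-counted.  The function and variable names are made
 * up for illustration.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static int depth;
static uint64_t start_ns, total_ns;

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void bypass(int enable)
{
	if (enable) {
		if (++depth != 1)
			return;
		start_ns = now_ns();		/* outermost activation only */
	} else {
		if (--depth != 0)
			return;
		total_ns += now_ns() - start_ns;
	}
}

int main(void)
{
	bypass(1);
	bypass(1);	/* nested: ignored by the timer */
	bypass(0);
	bypass(0);
	printf("bypassed for %llu ns\n", (unsigned long long)total_ns);
	return 0;
}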
We can't grab any mutexes or rwsems as they might @@ -4762,6 +4535,8 @@ static void scx_clear_softlockup(void) static void scx_ops_bypass(bool bypass) { static DEFINE_RAW_SPINLOCK(bypass_lock); + static unsigned long bypass_timestamp; + int cpu; unsigned long flags; @@ -4771,11 +4546,15 @@ static void scx_ops_bypass(bool bypass) WARN_ON_ONCE(scx_ops_bypass_depth <= 0); if (scx_ops_bypass_depth != 1) goto unlock; + bypass_timestamp = ktime_get_ns(); + scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1); } else { scx_ops_bypass_depth--; WARN_ON_ONCE(scx_ops_bypass_depth < 0); if (scx_ops_bypass_depth != 0) goto unlock; + scx_add_event(SCX_EV_BYPASS_DURATION, + ktime_get_ns() - bypass_timestamp); } atomic_inc(&scx_ops_breather_depth); @@ -4899,7 +4678,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) struct task_struct *p; struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; - int i, kind; + int i, kind, cpu; kind = atomic_read(&scx_exit_kind); while (true) { @@ -4982,14 +4761,25 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); + /* + * Invalidate all the rq clocks to prevent getting outdated + * rq clocks from a previous scx scheduler. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + scx_rq_clock_invalidate(rq); + } + /* no task is on scx, turn off all the switches and flush in-progress calls */ static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_allow_queued_wakeup); static_branch_disable(&scx_ops_enq_last); static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_enq_migration_disabled); static_branch_disable(&scx_ops_cpu_preempt); - static_branch_disable(&scx_builtin_idle_enabled); + scx_idle_disable(); synchronize_rcu(); if (ei->kind >= SCX_EXIT_ERROR) { @@ -5206,9 +4996,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, - p->scx.dsq_vtime); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", + p->scx.dsq_vtime, p->scx.slice, p->scx.weight); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { @@ -5238,6 +5029,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) .at_jiffies = jiffies, }; struct seq_buf s; + struct scx_event_stats events; unsigned long flags; char *buf; int cpu; @@ -5346,6 +5138,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) rq_unlock(rq, &rf); } + dump_newline(&s); + dump_line(&s, "Event counters"); + dump_line(&s, "--------------"); + + scx_bpf_events(&events, sizeof(events)); + scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK); + scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); + scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL); + scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); + scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); + scx_dump_event(s, 
&events, SCX_EV_BYPASS_ACTIVATE); + if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), trunc_marker, sizeof(trunc_marker)); @@ -5399,7 +5206,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) { struct kthread_worker *helper; - helper = kthread_create_worker(0, name); + helper = kthread_run_worker(0, name); if (helper) sched_set_fifo(helper->task); return helper; @@ -5435,6 +5242,16 @@ static int validate_ops(const struct sched_ext_ops *ops) return -EINVAL; } + /* + * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle + * selection policy to be enabled. + */ + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) && + (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled"); + return -EINVAL; + } + return 0; } @@ -5453,6 +5270,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_lock(&scx_ops_enable_mutex); + /* + * Clear event counters so a new scx scheduler gets + * fresh event counter values. + */ + for_each_possible_cpu(cpu) { + struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu); + memset(e, 0, sizeof(*e)); + } + if (!scx_ops_helper) { WRITE_ONCE(scx_ops_helper, scx_create_rt_helper("sched_ext_ops_helper")); @@ -5550,9 +5376,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable_cpuslocked(&scx_has_op[i]); check_hotplug_seq(ops); -#ifdef CONFIG_SMP - update_selcpu_topology(); -#endif + scx_idle_update_selcpu_topology(ops); + cpus_read_unlock(); ret = validate_ops(ops); @@ -5591,20 +5416,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) if (((void (**)(void))ops)[i]) static_branch_enable(&scx_has_op[i]); + if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + static_branch_enable(&scx_ops_allow_queued_wakeup); if (ops->flags & SCX_OPS_ENQ_LAST) static_branch_enable(&scx_ops_enq_last); - if (ops->flags & SCX_OPS_ENQ_EXITING) static_branch_enable(&scx_ops_enq_exiting); + if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED) + static_branch_enable(&scx_ops_enq_migration_disabled); if (scx_ops.cpu_acquire || scx_ops.cpu_release) static_branch_enable(&scx_ops_cpu_preempt); - if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { - reset_idle_masks(); - static_branch_enable(&scx_builtin_idle_enabled); - } else { - static_branch_disable(&scx_builtin_idle_enabled); - } + scx_idle_enable(ops); /* * Lock out forks, cgroup on/offlining and moves before opening the @@ -6243,10 +6066,8 @@ void __init init_sched_ext_class(void) SCX_TG_ONLINE); BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -#ifdef CONFIG_SMP - BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); - BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -#endif + scx_idle_init_masks(); + scx_kick_cpus_pnt_seqs = __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, __alignof__(scx_kick_cpus_pnt_seqs[0])); @@ -6254,15 +6075,16 @@ void __init init_sched_ext_class(void) for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); + int n = cpu_to_node(cpu); init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); - BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); - 
BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); @@ -6279,55 +6101,6 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. */ -#include <linux/btf_ids.h> - -__bpf_kfunc_start_defs(); - -/** - * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() - * @p: task_struct to select a CPU for - * @prev_cpu: CPU @p was on previously - * @wake_flags: %SCX_WAKE_* flags - * @is_idle: out parameter indicating whether the returned CPU is idle - * - * Can only be called from ops.select_cpu() if the built-in CPU selection is - * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. - * @p, @prev_cpu and @wake_flags match ops.select_cpu(). - * - * Returns the picked CPU with *@is_idle indicating whether the picked CPU is - * currently idle and thus a good candidate for direct dispatching. - */ -__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *is_idle) -{ - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - goto prev_cpu; - } - - if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) - goto prev_cpu; - -#ifdef CONFIG_SMP - return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -#endif - -prev_cpu: - *is_idle = false; - return prev_cpu; -} - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) - -static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { - .owner = THIS_MODULE, - .set = &scx_kfunc_ids_select_cpu, -}; - static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) @@ -6387,9 +6160,7 @@ __bpf_kfunc_start_defs(); * ops.select_cpu(), and ops.dispatch(). * * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch - * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be - * used to target the local DSQ of a CPU other than the enqueueing one. Use - * ops.select_cpu() to be on the target CPU in the first place. + * and @p must match the task being enqueued. * * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p * will be directly inserted into the corresponding dispatch queue after @@ -6851,8 +6622,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. + * + * Also skip re-enqueueing tasks that can only run on this + * CPU, as they would just be re-added to the same local + * DSQ without any benefit. 
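For context, a minimal sketch (not part of this patch) of the usual caller of scx_bpf_reenqueue_local(): an ops.cpu_release() callback in a BPF scheduler. The names are illustrative; the callback signature follows sched_ext_ops and BPF_STRUCT_OPS() is assumed from the scx example headers.

void BPF_STRUCT_OPS(sketch_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
	/*
	 * A higher-priority sched class took the CPU; push whatever was
	 * sitting in its local DSQ back through ops.enqueue() so it can be
	 * placed elsewhere. Per the hunk above, tasks that can only run on
	 * this CPU are left alone.
	 */
	u32 nr = scx_bpf_reenqueue_local();

	if (nr)
		bpf_printk("cpu%d released, re-enqueued %u tasks", cpu, nr);
}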
*/ - if (p->migration_pending) + if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) continue; dispatch_dequeue(rq, p); @@ -7228,7 +7003,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, } /** - * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler + * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler * @fmt: format string * @data: format string parameters packaged using ___bpf_fill() macro * @data__sz: @data len, must end in '__sz' for the verifier @@ -7320,7 +7095,6 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) * scx_bpf_cpuperf_set - Set the relative performance target of a CPU * @cpu: CPU of interest * @perf: target performance level [0, %SCX_CPUPERF_ONE] - * @flags: %SCX_CPUPERF_* flags * * Set the target performance level of @cpu to @perf. @perf is in linear * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the @@ -7350,6 +7124,16 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) } /** + * scx_bpf_nr_node_ids - Return the number of possible node IDs + * + * All valid node IDs in the system are smaller than the returned value. + */ +__bpf_kfunc u32 scx_bpf_nr_node_ids(void) +{ + return nr_node_ids; +} + +/** * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs * * All valid CPU IDs in the system are smaller than the returned value. @@ -7390,149 +7174,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) } /** - * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking - * per-CPU cpumask. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) -{ - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return cpu_none_mask; - } - -#ifdef CONFIG_SMP - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, - * per-physical-core cpumask. Can be used to determine if an entire physical - * core is free. - * - * Returns NULL if idle tracking is not enabled, or running on a UP kernel. - */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) -{ - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return cpu_none_mask; - } - -#ifdef CONFIG_SMP - if (sched_smt_active()) - return idle_masks.smt; - else - return idle_masks.cpu; -#else - return cpu_none_mask; -#endif -} - -/** - * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to - * either the percpu, or SMT idle-tracking cpumask. - */ -__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -{ - /* - * Empty function body because we aren't actually acquiring or releasing - * a reference to a global idle cpumask, which is read-only in the - * caller and is never released. The acquire / release semantics here - * are just used to make the cpumask a trusted pointer in the caller. - */ -} - -/** - * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state - * @cpu: cpu to test and clear idle for - * - * Returns %true if @cpu was idle and its idle state was successfully cleared. - * %false otherwise. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 
- */ -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -{ - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return false; - } - - if (ops_cpu_valid(cpu, NULL)) - return test_and_clear_cpu_idle(cpu); - else - return false; -} - -/** - * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu - * number on success. -%EBUSY if no matching cpu was found. - * - * Idle CPU tracking may race against CPU scheduling state transitions. For - * example, this function may return -%EBUSY as CPUs are transitioning into the - * idle state. If the caller then assumes that there will be dispatch events on - * the CPUs as they were all busy, the scheduler may end up stalling with CPUs - * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and - * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch - * event in the near future. - * - * Unavailable if ops.update_idle() is implemented and - * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. - */ -__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); - return -EBUSY; - } - - return scx_pick_idle_cpu(cpus_allowed, flags); -} - -/** - * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU - * @cpus_allowed: Allowed cpumask - * @flags: %SCX_PICK_IDLE_CPU_* flags - * - * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any - * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu - * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is - * empty. - * - * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not - * set, this function can't tell which CPUs are idle and will always pick any - * CPU. - */ -__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, - u64 flags) -{ - s32 cpu; - - if (static_branch_likely(&scx_builtin_idle_enabled)) { - cpu = scx_pick_idle_cpu(cpus_allowed, flags); - if (cpu >= 0) - return cpu; - } - - cpu = cpumask_any_distribute(cpus_allowed); - if (cpu < nr_cpu_ids) - return cpu; - else - return -EBUSY; -} - -/** * scx_bpf_task_running - Is task currently running? * @p: task of interest */ @@ -7590,6 +7231,105 @@ out: } #endif +/** + * scx_bpf_now - Returns a high-performance monotonically non-decreasing + * clock for the current CPU. The clock returned is in nanoseconds. + * + * It provides the following properties: + * + * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently + * to account for execution time and track tasks' runtime properties. + * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which + * eventually reads a hardware timestamp counter -- is neither performant nor + * scalable. scx_bpf_now() aims to provide a high-performance clock by + * using the rq clock in the scheduler core whenever possible. + * + * 2) High enough resolution for the BPF scheduler use cases: In most BPF + * scheduler use cases, the required clock resolution is lower than the most + * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically + * uses the rq clock in the scheduler core whenever it is valid. 
It considers + * that the rq clock is valid from the time the rq clock is updated + * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). + * + * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() + * guarantees the clock never goes backward when comparing them in the same + * CPU. On the other hand, when comparing clocks in different CPUs, there + * is no such guarantee -- the clock can go backward. It provides a + * monotonically *non-decreasing* clock so that it would provide the same + * clock values in two different scx_bpf_now() calls in the same CPU + * during the same period of when the rq clock is valid. + */ +__bpf_kfunc u64 scx_bpf_now(void) +{ + struct rq *rq; + u64 clock; + + preempt_disable(); + + rq = this_rq(); + if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { + /* + * If the rq clock is valid, use the cached rq clock. + * + * Note that scx_bpf_now() is re-entrant between a process + * context and an interrupt context (e.g., timer interrupt). + * However, we don't need to consider the race between them + * because such race is not observable from a caller. + */ + clock = READ_ONCE(rq->scx.clock); + } else { + /* + * Otherwise, return a fresh rq clock. + * + * The rq clock is updated outside of the rq lock. + * In this case, keep the updated rq clock invalid so the next + * kfunc call outside the rq lock gets a fresh rq clock. + */ + clock = sched_clock_cpu(cpu_of(rq)); + } + + preempt_enable(); + + return clock; +} + +/* + * scx_bpf_events - Get a system-wide event counter to + * @events: output buffer from a BPF program + * @events__sz: @events len, must end in '__sz'' for the verifier + */ +__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, + size_t events__sz) +{ + struct scx_event_stats e_sys, *e_cpu; + int cpu; + + /* Aggregate per-CPU event counters into the system-wide counters. */ + memset(&e_sys, 0, sizeof(e_sys)); + for_each_possible_cpu(cpu) { + e_cpu = per_cpu_ptr(&event_stats_cpu, cpu); + scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); + scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH); + scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE); + } + + /* + * We cannot entirely trust a BPF-provided size since a BPF program + * might be compiled against a different vmlinux.h, of which + * scx_event_stats would be larger (a newer vmlinux.h) or smaller + * (an older vmlinux.h). Hence, we use the smaller size to avoid + * memory corruption. 
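As a usage illustration (a sketch, not part of this patch): a BPF scheduler could snapshot the aggregated counters and stamp them with the cheap rq-based clock. dump_events() is a hypothetical helper; the SCX_EV_* field names are assumed to mirror the event constants, as the scx_agg_event()/scx_dump_event() calls above imply.

static void dump_events(void)
{
	struct scx_event_stats ev;

	/* The kernel clamps the copy to the smaller of the two sizes, see above. */
	scx_bpf_events(&ev, sizeof(ev));

	bpf_printk("now=%llu slice_dfl=%lld bypass_act=%lld",
		   scx_bpf_now(),
		   ev.SCX_EV_ENQ_SLICE_DFL,
		   ev.SCX_EV_BYPASS_ACTIVATE);
}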
+ */ + events__sz = min(events__sz, sizeof(*events)); + memcpy(events, &e_sys, events__sz); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -7605,6 +7345,7 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) @@ -7621,6 +7362,8 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif +BTF_ID_FLAGS(func, scx_bpf_now) +BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { @@ -7644,8 +7387,6 @@ static int __init scx_init(void) * check using scx_kf_allowed(). */ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, - &scx_kfunc_set_select_cpu)) || - (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_dispatch)) || @@ -7665,6 +7406,12 @@ static int __init scx_init(void) return ret; } + ret = scx_idle_init(); + if (ret) { + pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret); + return ret; + } + ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); if (ret) { pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 4d022d17ac7d..1bda96b19a1b 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,6 +8,8 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT +DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); + void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p) return scx_enabled() && p->sched_class == &ext_sched_class; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) +{ + return !scx_enabled() || + static_branch_likely(&scx_ops_allow_queued_wakeup) || + p->sched_class != &ext_sched_class; +} + #ifdef CONFIG_SCHED_CORE bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi); @@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {} static inline void scx_rq_deactivate(struct rq *rq) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } static inline bool task_on_scx(const struct task_struct *p) { return false; } +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; } static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -73,7 +83,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); -void scx_move_task(struct task_struct *p); +void scx_cgroup_move_task(struct task_struct *p); void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); @@ -82,7 +92,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle); static inline int scx_tg_online(struct task_group *tg) { return 0; } static 
inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static inline void scx_move_task(struct task_struct *p) {} +static inline void scx_cgroup_move_task(struct task_struct *p) {} static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c new file mode 100644 index 000000000000..52c36a70a3d0 --- /dev/null +++ b/kernel/sched/ext_idle.c @@ -0,0 +1,1171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Built-in idle CPU tracking policy. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#include "ext_idle.h" + +/* Enable/disable built-in idle CPU selection policy */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); + +/* Enable/disable per-node idle cpumasks */ +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP +/* Enable/disable LLC aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); + +/* Enable/disable NUMA aware optimizations */ +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); + +/* + * cpumasks to track idle CPUs within each NUMA node. + * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask + * from is used to track all the idle CPUs in the system. + */ +struct scx_idle_cpus { + cpumask_var_t cpu; + cpumask_var_t smt; +}; + +/* + * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE + * is not enabled). + */ +static struct scx_idle_cpus scx_idle_global_masks; + +/* + * Per-node idle cpumasks. + */ +static struct scx_idle_cpus **scx_idle_node_masks; + +/* + * Return the idle masks associated to a target @node. + * + * NUMA_NO_NODE identifies the global idle cpumask. + */ +static struct scx_idle_cpus *idle_cpumask(int node) +{ + return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node]; +} + +/* + * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if + * per-node idle cpumasks are disabled. + */ +static int scx_cpu_node_if_enabled(int cpu) +{ + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +} + +bool scx_idle_test_and_clear_cpu(int cpu) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + +#ifdef CONFIG_SCHED_SMT + /* + * SMT mask should be cleared whether we can claim @cpu or not. The SMT + * cluster is not wholly idle either way. This also prevents + * scx_pick_idle_cpu() from getting caught in an infinite loop. + */ + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + /* + * If offline, @cpu is not its own sibling and + * scx_pick_idle_cpu() can get caught in an infinite loop as + * @cpu is never cleared from the idle SMT mask. Ensure that + * @cpu is eventually cleared. + * + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to + * reduce memory writes, which may help alleviate cache + * coherence pressure. 
+ */ + if (cpumask_intersects(smt, idle_smts)) + cpumask_andnot(idle_smts, idle_smts, smt); + else if (cpumask_test_cpu(cpu, idle_smts)) + __cpumask_clear_cpu(cpu, idle_smts); + } +#endif + + return cpumask_test_and_clear_cpu(cpu, idle_cpus); +} + +/* + * Pick an idle CPU in a specific NUMA node. + */ +static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + int cpu; + +retry: + if (sched_smt_active()) { + cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed); + if (cpu < nr_cpu_ids) + goto found; + + if (flags & SCX_PICK_IDLE_CORE) + return -EBUSY; + } + + cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed); + if (cpu >= nr_cpu_ids) + return -EBUSY; + +found: + if (scx_idle_test_and_clear_cpu(cpu)) + return cpu; + else + goto retry; +} + +/* + * Tracks nodes that have not yet been visited when searching for an idle + * CPU across all available nodes. + */ +static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited); + +/* + * Search for an idle CPU across all nodes, excluding @node. + */ +static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + nodemask_t *unvisited; + s32 cpu = -EBUSY; + + preempt_disable(); + unvisited = this_cpu_ptr(&per_cpu_unvisited); + + /* + * Restrict the search to the online nodes (excluding the current + * node that has been visited already). + */ + nodes_copy(*unvisited, node_states[N_ONLINE]); + node_clear(node, *unvisited); + + /* + * Traverse all nodes in order of increasing distance, starting + * from @node. + * + * This loop is O(N^2), with N being the amount of NUMA nodes, + * which might be quite expensive in large NUMA systems. However, + * this complexity comes into play only when a scheduler enables + * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU + * without specifying a target NUMA node, so it shouldn't be a + * bottleneck is most cases. + * + * As a future optimization we may want to cache the list of nodes + * in a per-node array, instead of actually traversing them every + * time. + */ + for_each_node_numadist(node, *unvisited) { + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + break; + } + preempt_enable(); + + return cpu; +} + +/* + * Find an idle CPU in the system, starting from @node. + */ +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + s32 cpu; + + /* + * Always search in the starting node first (this is an + * optimization that can save some cycles even when the search is + * not limited to a single node). + */ + cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + /* + * Stop the search if we are using only a single global cpumask + * (NUMA_NO_NODE) or if the search is restricted to the first node + * only. + */ + if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE) + return -EBUSY; + + /* + * Extend the search to the other online nodes. + */ + return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags); +} + +/* + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC + * domain is not defined). + */ +static unsigned int llc_weight(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sd->span_weight; +} + +/* + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC + * domain is not defined). 
+ */ +static struct cpumask *llc_span(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return NULL; + + return sched_domain_span(sd); +} + +/* + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the + * NUMA domain is not defined). + */ +static unsigned int numa_weight(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return 0; + sg = sd->groups; + if (!sg) + return 0; + + return sg->group_weight; +} + +/* + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA + * domain is not defined). + */ +static struct cpumask *numa_span(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return NULL; + sg = sd->groups; + if (!sg) + return NULL; + + return sched_group_span(sg); +} + +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) + if (llc_weight(cpu) != numa_weight(cpu)) + return true; + + return false; +} + +/* + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. + */ +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) +{ + bool enable_llc = false, enable_numa = false; + unsigned int nr_cpus; + s32 cpu = cpumask_first(cpu_online_mask); + + /* + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. + * + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. + */ + rcu_read_lock(); + nr_cpus = llc_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus()) + enable_llc = true; + pr_debug("sched_ext: LLC=%*pb weight=%u\n", + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); + } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlap + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. 
+ * + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA + * optimization, as we would naturally select idle CPUs within + * specific NUMA nodes querying the corresponding per-node cpumask. + */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), nr_cpus); + } + } + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + str_enabled_disabled(enable_llc)); + pr_debug("sched_ext: NUMA idle selection %s\n", + str_enabled_disabled(enable_numa)); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); +} + +/* + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same LLC + * to maintain cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node to reduce memory access latency. + * + * 5. Pick any idle CPU usable by the task. + * + * Step 3 and 4 are performed only if the system has, respectively, + * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs. + * + * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always + * begin in @prev_cpu's node and proceed to other nodes in order of + * increasing distance. + * + * Return the picked CPU if idle, or a negative value otherwise. + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags) +{ + const struct cpumask *llc_cpus = NULL; + const struct cpumask *numa_cpus = NULL; + int node = scx_cpu_node_if_enabled(prev_cpu); + s32 cpu; + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + + /* + * Determine the scheduling domain only if the task is allowed to run + * on all CPUs. + * + * This is done primarily for efficiency, as it avoids the overhead of + * updating a cpumask every time we need to select an idle CPU (which + * can be costly in large SMP systems), but it also aligns logically: + * if a task's scheduling domain is restricted by user-space (through + * CPU affinity), the task will simply use the flat scheduling domain + * defined by user-space. + */ + if (p->nr_cpus_allowed >= num_possible_cpus()) { + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = numa_span(prev_cpu); + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) + llc_cpus = llc_span(prev_cpu); + } + + /* + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. 
+ */ + if (wake_flags & SCX_WAKE_SYNC) { + int waker_node; + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + cpu = smp_processor_id(); + if (cpus_share_cache(cpu, prev_cpu) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + waker_node = cpu_to_node(cpu); + if (!(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0 && + (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && + !cpumask_empty(idle_cpumask(waker_node)->cpu)) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto out_unlock; + } + } + + /* + * If CPU has SMT, any wholly idle CPU is likely a better pick than + * partially idle @prev_cpu. + */ + if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ + if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && + scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any full-idle core usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always + * begin in prev_cpu's node and proceed to other nodes in + * order of increasing distance. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto out_unlock; + + /* + * Give up if we're strictly looking for a full-idle SMT + * core. + */ + if (flags & SCX_PICK_IDLE_CORE) { + cpu = prev_cpu; + goto out_unlock; + } + } + + /* + * Use @prev_cpu if it's idle. + */ + if (scx_idle_test_and_clear_cpu(prev_cpu)) { + cpu = prev_cpu; + goto out_unlock; + } + + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = pick_idle_cpu_in_node(llc_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = pick_idle_cpu_in_node(numa_cpus, node, 0); + if (cpu >= 0) + goto out_unlock; + } + + /* + * Search for any idle CPU usable by the task. + * + * If the node-aware idle CPU selection policy is enabled + * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin + * in prev_cpu's node and proceed to other nodes in order of + * increasing distance. + */ + cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags); + if (cpu >= 0) + goto out_unlock; + +out_unlock: + rcu_read_unlock(); + + return cpu; +} + +/* + * Initialize global and per-node idle cpumasks. 
+ */ +void scx_idle_init_masks(void) +{ + int node; + + /* Allocate global idle cpumasks */ + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); + + /* Allocate per-node idle cpumasks */ + scx_idle_node_masks = kcalloc(num_possible_nodes(), + sizeof(*scx_idle_node_masks), GFP_KERNEL); + BUG_ON(!scx_idle_node_masks); + + for_each_node(node) { + scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks), + GFP_KERNEL, node); + BUG_ON(!scx_idle_node_masks[node]); + + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); + } +} + +static void update_builtin_idle(int cpu, bool idle) +{ + int node = scx_cpu_node_if_enabled(cpu); + struct cpumask *idle_cpus = idle_cpumask(node)->cpu; + + assign_cpu(cpu, idle_cpus, idle); + +#ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) { + const struct cpumask *smt = cpu_smt_mask(cpu); + struct cpumask *idle_smts = idle_cpumask(node)->smt; + + if (idle) { + /* + * idle_smt handling is racy but that's fine as it's + * only for optimization and self-correcting. + */ + if (!cpumask_subset(smt, idle_cpus)) + return; + cpumask_or(idle_smts, idle_smts, smt); + } else { + cpumask_andnot(idle_smts, idle_smts, smt); + } + } +#endif +} + +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + */ + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); +} + +static void reset_idle_masks(struct sched_ext_ops *ops) +{ + int node; + + /* + * Consider all online cpus idle. Should converge to the actual state + * quickly. 
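scx_idle_enable() below keys the built-in tracking off the ops flags. A hedged sketch of a loader-side definition that keeps the built-in (now optionally per-node) idle masks while still receiving idle notifications; SCX_OPS_DEFINE() and BPF_STRUCT_OPS() are assumed from the scx example headers. Without SCX_OPS_KEEP_BUILTIN_IDLE, supplying ops.update_idle() would disable built-in tracking and validate_ops() above would reject SCX_OPS_BUILTIN_IDLE_PER_NODE.

void BPF_STRUCT_OPS(sketch_update_idle, s32 cpu, bool idle)
{
	/* Called only for real busy/idle transitions (do_notify == true). */
}

SCX_OPS_DEFINE(sketch_ops,
	       .update_idle	= (void *)sketch_update_idle,
	       .flags		= SCX_OPS_KEEP_BUILTIN_IDLE |
				  SCX_OPS_BUILTIN_IDLE_PER_NODE,
	       .name		= "sketch");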
+ */ + if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) { + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask); + cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask); + return; + } + + for_each_node(node) { + const struct cpumask *node_mask = cpumask_of_node(node); + + cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask); + cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); + } +} +#endif /* CONFIG_SMP */ + +void scx_idle_enable(struct sched_ext_ops *ops) +{ + if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) + static_branch_enable(&scx_builtin_idle_enabled); + else + static_branch_disable(&scx_builtin_idle_enabled); + + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) + static_branch_enable(&scx_builtin_idle_per_node); + else + static_branch_disable(&scx_builtin_idle_per_node); + +#ifdef CONFIG_SMP + reset_idle_masks(ops); +#endif +} + +void scx_idle_disable(void) +{ + static_branch_disable(&scx_builtin_idle_enabled); + static_branch_disable(&scx_builtin_idle_per_node); +} + +/******************************************************************************** + * Helpers that can be called from the BPF scheduler. + */ + +static int validate_node(int node) +{ + if (!static_branch_likely(&scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is disabled"); + return -EOPNOTSUPP; + } + + /* Return no entry for NUMA_NO_NODE (not a critical scx error) */ + if (node == NUMA_NO_NODE) + return -ENOENT; + + /* Make sure node is in a valid range */ + if (node < 0 || node >= nr_node_ids) { + scx_ops_error("invalid node %d", node); + return -EINVAL; + } + + /* Make sure the node is part of the set of possible nodes */ + if (!node_possible(node)) { + scx_ops_error("unavailable node %d", node); + return -EINVAL; + } + + return node; +} + +__bpf_kfunc_start_defs(); + +static bool check_builtin_idle_enabled(void) +{ + if (static_branch_likely(&scx_builtin_idle_enabled)) + return true; + + scx_ops_error("built-in idle tracking is disabled"); + return false; +} + +/** + * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or + * trigger an error if @cpu is invalid + * @cpu: target CPU + */ +__bpf_kfunc int scx_bpf_cpu_node(s32 cpu) +{ +#ifdef CONFIG_NUMA + if (!ops_cpu_valid(cpu, NULL)) + return NUMA_NO_NODE; + + return cpu_to_node(cpu); +#else + return 0; +#endif +} + +/** + * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @is_idle: out parameter indicating whether the returned CPU is idle + * + * Can only be called from ops.select_cpu() if the built-in CPU selection is + * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. + * @p, @prev_cpu and @wake_flags match ops.select_cpu(). + * + * Returns the picked CPU with *@is_idle indicating whether the picked CPU is + * currently idle and thus a good candidate for direct dispatching. 
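The usual calling pattern from ops.select_cpu() looks roughly like this (a sketch; scx_bpf_dsq_insert() and SCX_SLICE_DFL are existing sched_ext API, the remaining names are illustrative):

s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		/* Idle CPU claimed: dispatch directly, ops.enqueue() is skipped. */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}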
+ */ +__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags, bool *is_idle) +{ +#ifdef CONFIG_SMP + s32 cpu; +#endif + if (!ops_cpu_valid(prev_cpu, NULL)) + goto prev_cpu; + + if (!check_builtin_idle_enabled()) + goto prev_cpu; + + if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) + goto prev_cpu; + +#ifdef CONFIG_SMP + cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0); + if (cpu >= 0) { + *is_idle = true; + return cpu; + } +#endif + +prev_cpu: + *is_idle = false; + return prev_cpu; +} + +/** + * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the + * idle-tracking per-CPU cpumask of a target NUMA node. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking + * per-CPU cpumask. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the + * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be + * used to determine if an entire physical core is free. + * @node: target NUMA node + * + * Returns an empty cpumask if idle tracking is not enabled, if @node is + * not valid, or running on a UP kernel. In this case the actual error will + * be reported to the BPF scheduler via scx_ops_error(). + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) +{ + node = validate_node(node); + if (node < 0) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(node)->smt; + else + return idle_cpumask(node)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, + * per-physical-core cpumask. Can be used to determine if an entire physical + * core is free. + * + * Returns an empty mask if idle tracking is not enabled, or running on a + * UP kernel. + */ +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +{ + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { + scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + return cpu_none_mask; + } + + if (!check_builtin_idle_enabled()) + return cpu_none_mask; + +#ifdef CONFIG_SMP + if (sched_smt_active()) + return idle_cpumask(NUMA_NO_NODE)->smt; + else + return idle_cpumask(NUMA_NO_NODE)->cpu; +#else + return cpu_none_mask; +#endif +} + +/** + * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to + * either the percpu, or SMT idle-tracking cpumask. 
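A small sketch of the acquire/use/release pattern for these masks (node_has_idle_core() is hypothetical; bpf_cpumask_empty() is one of the generic BPF cpumask kfuncs):

static bool node_has_idle_core(int node)
{
	const struct cpumask *smt;
	bool ret;

	smt = scx_bpf_get_idle_smtmask_node(node);	/* empty mask on error */
	ret = !bpf_cpumask_empty(smt);
	scx_bpf_put_idle_cpumask(smt);			/* drop the trusted kptr */

	return ret;
}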
+ * @idle_mask: &cpumask to use + */ +__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) +{ + /* + * Empty function body because we aren't actually acquiring or releasing + * a reference to a global idle cpumask, which is read-only in the + * caller and is never released. The acquire / release semantics here + * are just used to make the cpumask a trusted pointer in the caller. + */ +} + +/** + * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state + * @cpu: cpu to test and clear idle for + * + * Returns %true if @cpu was idle and its idle state was successfully cleared. + * %false otherwise. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + */ +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +{ + if (!check_builtin_idle_enabled()) + return false; + + if (ops_cpu_valid(cpu, NULL)) + return scx_idle_test_and_clear_cpu(cpu); + else + return false; +} + +/** + * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_* flags + * + * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. + * + * Returns the picked idle cpu number on success, or -%EBUSY if no matching + * cpu was found. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node). + * + * Always returns an error if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if + * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. + */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + node = validate_node(node); + if (node < 0) + return node; + + return scx_pick_idle_cpu(cpus_allowed, node, flags); +} + +/** + * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu + * number on success. -%EBUSY if no matching cpu was found. + * + * Idle CPU tracking may race against CPU scheduling state transitions. For + * example, this function may return -%EBUSY as CPUs are transitioning into the + * idle state. If the caller then assumes that there will be dispatch events on + * the CPUs as they were all busy, the scheduler may end up stalling with CPUs + * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and + * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch + * event in the near future. + * + * Unavailable if ops.update_idle() is implemented and + * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_idle_cpu_node() instead. 
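A node-aware placement sketch from ops.enqueue(), built on the kfuncs above (illustrative names; scx_bpf_task_cpu(), scx_bpf_dsq_insert() and the SCX_DSQ_*/SCX_SLICE_DFL constants are existing sched_ext API):

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	int node = scx_bpf_cpu_node(scx_bpf_task_cpu(p));
	s32 cpu;

	/* Stay in the task's current node; fall back instead of crossing nodes. */
	cpu = scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, SCX_PICK_IDLE_IN_NODE);
	if (cpu >= 0) {
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, enq_flags);
		return;
	}

	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}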
+ */ +__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (!check_builtin_idle_enabled()) + return -EBUSY; + + return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); +} + +/** + * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available + * or pick any CPU from @node + * @cpus_allowed: Allowed cpumask + * @node: target NUMA node + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * The search starts from @node and proceeds to other online NUMA nodes in + * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified, + * in which case the search is limited to the target @node, regardless of + * the CPU idle state). + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, + int node, u64 flags) +{ + s32 cpu; + + node = validate_node(node); + if (node < 0) + return node; + + cpu = scx_pick_idle_cpu(cpus_allowed, node, flags); + if (cpu >= 0) + return cpu; + + if (flags & SCX_PICK_IDLE_IN_NODE) + cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed); + else + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +/** + * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU + * @cpus_allowed: Allowed cpumask + * @flags: %SCX_PICK_IDLE_CPU_* flags + * + * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any + * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu + * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is + * empty. + * + * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not + * set, this function can't tell which CPUs are idle and will always pick any + * CPU. + * + * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use + * scx_bpf_pick_any_cpu_node() instead. 
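And the companion pattern the comment above alludes to: when every CPU looks busy, scx_bpf_pick_any_cpu_node() plus scx_bpf_kick_cpu() still guarantees a dispatch event in the node (sketch; kick_node() is hypothetical):

static void kick_node(const struct cpumask *cpus_allowed, int node)
{
	s32 cpu = scx_bpf_pick_any_cpu_node(cpus_allowed, node, 0);

	if (cpu >= 0)
		scx_bpf_kick_cpu(cpu, 0);
}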
+ */ +__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, + u64 flags) +{ + s32 cpu; + + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { + scx_ops_error("per-node idle tracking is enabled"); + return -EBUSY; + } + + if (static_branch_likely(&scx_builtin_idle_enabled)) { + cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); + if (cpu >= 0) + return cpu; + } + + cpu = cpumask_any_distribute(cpus_allowed); + if (cpu < nr_cpu_ids) + return cpu; + else + return -EBUSY; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(scx_kfunc_ids_idle) +BTF_ID_FLAGS(func, scx_bpf_cpu_node) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_idle) + +static const struct btf_kfunc_id_set scx_kfunc_set_idle = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_idle, +}; + +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) + +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_select_cpu, +}; + +int scx_idle_init(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); + + return ret; +} diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h new file mode 100644 index 000000000000..511cc2221f7a --- /dev/null +++ b/kernel/sched/ext_idle.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com> + */ +#ifndef _KERNEL_SCHED_EXT_IDLE_H +#define _KERNEL_SCHED_EXT_IDLE_H + +struct sched_ext_ops; + +#ifdef CONFIG_SMP +void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); +void scx_idle_init_masks(void); +bool scx_idle_test_and_clear_cpu(int cpu); +s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); +#else /* !CONFIG_SMP */ +static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} +static inline void scx_idle_init_masks(void) {} +static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } +static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif /* CONFIG_SMP */ + +s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags); +void scx_idle_enable(struct sched_ext_ops *ops); +void scx_idle_disable(void); +int scx_idle_init(void); + +#endif /* _KERNEL_SCHED_EXT_IDLE_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26958431deb7..e43993a4e580 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -37,6 +37,7 @@ #include <linux/sched/cputime.h> #include <linux/sched/isolation.h> #include <linux/sched/nohz.h> +#include <linux/sched/prio.h> #include <linux/cpuidle.h> #include <linux/interrupt.h> @@ -51,6 +52,8 @@ #include <asm/switch_to.h> +#include <uapi/linux/sched/types.h> + #include "sched.h" #include "stats.h" #include "autogroup.h" @@ -71,12 +74,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_base_slice = 750000ULL; -static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; +unsigned int sysctl_sched_base_slice = 700000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; -const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; static int __init setup_sched_thermal_decay_shift(char *str) { @@ -130,7 +133,7 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #endif #ifdef CONFIG_SYSCTL -static struct ctl_table sched_fair_sysctls[] = { +static const struct ctl_table sched_fair_sysctls[] = { #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", @@ -396,7 +399,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) static inline void assert_list_leaf_cfs_rq(struct rq *rq) { - SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); + WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); } /* Iterate through all leaf cfs_rq's on a runqueue */ @@ -523,7 +526,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) +static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) @@ -532,7 +535,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) return max_vruntime; } -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 
vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta < 0) @@ -693,7 +696,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { s64 vlag, limit; - SCHED_WARN_ON(!se->on_rq); + WARN_ON_ONCE(!se->on_rq); vlag = avg_vruntime(cfs_rq) - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); @@ -881,6 +884,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ +static inline void set_protect_slice(struct sched_entity *se) +{ + se->vlag = se->deadline; +} + +static inline bool protect_slice(struct sched_entity *se) +{ + return se->vlag == se->deadline; +} + +static inline void cancel_protect_slice(struct sched_entity *se) +{ + if (protect_slice(se)) + se->vlag = se->deadline + 1; +} + +/* * Earliest Eligible Virtual Deadline First * * In order to provide latency guarantees for different request sizes @@ -910,17 +933,13 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) * We can safely skip eligibility check if there is only one entity * in this cfs_rq, saving some cycles. */ - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - /* - * Once selected, run a task until it either becomes non-eligible or - * until it gets a new slice. See the HACK in set_next_entity(). - */ - if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -964,7 +983,6 @@ found: return best; } -#ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -991,7 +1009,6 @@ int sched_update_scaling(void) return 0; } #endif -#endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1245,7 +1262,7 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { @@ -2126,7 +2143,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); ns->util += cpu_util_cfs(cpu); - ns->nr_running += rq->cfs.h_nr_running; + ns->nr_running += rq->cfs.h_nr_runnable; ns->compute_capacity += capacity_of(cpu); if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { @@ -3298,7 +3315,7 @@ static void task_numa_work(struct callback_head *work) bool vma_pids_skipped; bool vma_pids_forced = false; - SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); work->next = work; /* @@ -3677,9 +3694,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &rq->cfs_tasks); } #endif - cfs_rq->nr_running++; - if (se_is_idle(se)) - cfs_rq->idle_nr_running++; + cfs_rq->nr_queued++; } static void @@ -3692,9 +3707,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } #endif - cfs_rq->nr_running--; - if (se_is_idle(se)) - cfs_rq->idle_nr_running--; + cfs_rq->nr_queued--; } /* @@ -4021,7 +4034,7 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa) * Make sure that 
rounding and/or propagation of PELT values never * break this. */ - SCHED_WARN_ON(sa->load_avg || + WARN_ON_ONCE(sa->load_avg || sa->util_avg || sa->runnable_avg); @@ -4046,15 +4059,17 @@ static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) { struct cfs_rq *prev_cfs_rq; struct list_head *prev; + struct rq *rq = rq_of(cfs_rq); if (cfs_rq->on_list) { prev = cfs_rq->leaf_cfs_rq_list.prev; } else { - struct rq *rq = rq_of(cfs_rq); - prev = rq->tmp_alone_branch; } + if (prev == &rq->leaf_cfs_rq_list) + return false; + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); return (prev_cfs_rq->tg->parent == cfs_rq->tg); @@ -5128,7 +5143,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { - return !cfs_rq->nr_running; + return !cfs_rq->nr_queued; } #define UPDATE_TG 0x0 @@ -5166,6 +5181,22 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ +void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_entity *se = &p->se; + + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + if (attr->sched_runtime) { + se->custom_slice = 1; + se->slice = clamp_t(u64, attr->sched_runtime, + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ + } else { + se->custom_slice = 0; + se->slice = sysctl_sched_base_slice; + } +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -5184,7 +5215,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { + if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; @@ -5277,8 +5308,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); -static inline bool cfs_bandwidth_used(void); - static void requeue_delayed_entity(struct sched_entity *se); @@ -5300,7 +5329,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -5333,7 +5362,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) { + if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); if (!throttled_hierarchy(cfs_rq)) { list_add_leaf_cfs_rq(cfs_rq); @@ -5372,10 +5401,19 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void set_delayed(struct sched_entity *se) { se->sched_delayed = 1; + + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since dequeue_entities() + * will account it for blocked tasks. 
+ */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_delayed++; + cfs_rq->h_nr_runnable--; if (cfs_rq_throttled(cfs_rq)) break; } @@ -5384,10 +5422,20 @@ static void set_delayed(struct sched_entity *se) static void clear_delayed(struct sched_entity *se) { se->sched_delayed = 0; + + /* + * Delayed se of cfs_rq have no tasks queued on them. + * Do not adjust h_nr_runnable since a dequeue has + * already accounted for it or an enqueue of a task + * below it will account for it in enqueue_task_fair(). + */ + if (!entity_is_task(se)) + return; + for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_delayed--; + cfs_rq->h_nr_runnable++; if (cfs_rq_throttled(cfs_rq)) break; } @@ -5404,12 +5452,13 @@ static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { bool sleep = flags & DEQUEUE_SLEEP; + int action = UPDATE_TG; update_curr(cfs_rq); clear_buddies(cfs_rq, se); if (flags & DEQUEUE_DELAYED) { - SCHED_WARN_ON(!se->sched_delayed); + WARN_ON_ONCE(!se->sched_delayed); } else { bool delay = sleep; /* @@ -5419,7 +5468,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_SPECIAL) delay = false; - SCHED_WARN_ON(delay && se->sched_delayed); + WARN_ON_ONCE(delay && se->sched_delayed); if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { @@ -5429,7 +5478,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } - int action = UPDATE_TG; if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) action |= DO_DETACH; @@ -5437,7 +5485,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. @@ -5475,7 +5523,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); return true; @@ -5496,15 +5544,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. - */ - se->vlag = se->deadline; + + set_protect_slice(se); } update_stats_curr_start(cfs_rq, se); - SCHED_WARN_ON(cfs_rq->curr); + WARN_ON_ONCE(cfs_rq->curr); cfs_rq->curr = se; /* @@ -5537,17 +5582,19 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); static struct sched_entity * pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { + struct sched_entity *se; + /* - * Enabling NEXT_BUDDY will affect latency but not fairness. + * Picking the ->next buddy will affect latency but not fairness. 
*/ - if (sched_feat(NEXT_BUDDY) && + if (sched_feat(PICK_BUDDY) && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ - SCHED_WARN_ON(cfs_rq->next->sched_delayed); + WARN_ON_ONCE(cfs_rq->next->sched_delayed); return cfs_rq->next; } - struct sched_entity *se = pick_eevdf(cfs_rq); + se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); /* @@ -5579,7 +5626,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } - SCHED_WARN_ON(cfs_rq->curr != prev); + WARN_ON_ONCE(cfs_rq->curr != prev); cfs_rq->curr = NULL; } @@ -5802,7 +5849,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_self = 0; - if (SCHED_WARN_ON((s64)delta < 0)) + if (WARN_ON_ONCE((s64)delta < 0)) delta = 0; cfs_rq->throttled_clock_self_time += delta; @@ -5822,8 +5869,8 @@ static int tg_throttle_down(struct task_group *tg, void *data) cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); - SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_running) + WARN_ON_ONCE(cfs_rq->throttled_clock_self); + if (cfs_rq->nr_queued) cfs_rq->throttled_clock_self = rq_clock(rq); } cfs_rq->throttle_count++; @@ -5836,8 +5883,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta, dequeue = 1; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, runnable_delta, idle_delta, dequeue = 1; + long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5867,9 +5914,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; - delayed_delta = cfs_rq->h_nr_delayed; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); int flags; @@ -5889,11 +5936,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, flags); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; - qcfs_rq->h_nr_delayed -= delayed_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5912,18 +5959,18 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; - qcfs_rq->h_nr_delayed -= delayed_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; } /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, task_delta); + sub_nr_running(rq, queued_delta); /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_running 
&& !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); done: /* @@ -5931,8 +5978,8 @@ done: * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; - SCHED_WARN_ON(cfs_rq->throttled_clock); - if (cfs_rq->nr_running) + WARN_ON_ONCE(cfs_rq->throttled_clock); + if (cfs_rq->nr_queued) cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -5942,8 +5989,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, runnable_delta, idle_delta; + long rq_h_nr_queued = rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -5976,9 +6023,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) goto unthrottle_throttle; } - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; - delayed_delta = cfs_rq->h_nr_delayed; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -5992,11 +6039,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; - qcfs_rq->h_nr_delayed += delayed_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6010,11 +6057,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; - qcfs_rq->h_nr_delayed += delayed_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6022,17 +6069,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_running && rq->cfs.h_nr_running) + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, task_delta); + add_nr_running(rq, queued_delta); unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr == rq->idle && rq->cfs.nr_running) + if (rq->curr == rq->idle && rq->cfs.nr_queued) resched_curr(rq); } @@ -6087,7 +6134,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) } /* Already enqueued */ - if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list))) return; first = list_empty(&rq->cfsb_csd_list); @@ -6106,7 +6153,7 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { lockdep_assert_rq_held(rq_of(cfs_rq)); - if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) || cfs_rq->runtime_remaining <= 0)) return; @@ -6142,7 +6189,7 @@ static bool 
distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) goto next; /* By the above checks, this should never be true */ - SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + WARN_ON_ONCE(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; @@ -6163,7 +6210,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) * We currently only expect to be unthrottling * a single cfs_rq locally. */ - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); list_add_tail(&cfs_rq->throttled_csd_list, &local_unthrottle); } @@ -6188,7 +6235,7 @@ next: rq_unlock_irqrestore(rq, &rf); } - SCHED_WARN_ON(!list_empty(&local_unthrottle)); + WARN_ON_ONCE(!list_empty(&local_unthrottle)); rcu_read_unlock(); @@ -6333,7 +6380,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued) return; __return_cfs_rq_runtime(cfs_rq); @@ -6505,14 +6552,14 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); /* Add a random offset so that timers interleave */ hrtimer_set_expires(&cfs_b->period_timer, get_random_u32_below(cfs_b->period)); - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; + hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); cfs_b->slack_started = false; } @@ -6604,6 +6651,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) lockdep_assert_rq_held(rq); + // Do not unthrottle for an active CPU + if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask)) + return; + /* * The rq clock has already been updated in the * set_rq_offline(), so we should skip updating @@ -6619,18 +6670,20 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) continue; /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = 1; - /* * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. 
*/ cfs_rq->runtime_enabled = 0; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); + if (!cfs_rq_throttled(cfs_rq)) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = 1; + unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); @@ -6679,11 +6732,6 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) #else /* CONFIG_CFS_BANDWIDTH */ -static inline bool cfs_bandwidth_used(void) -{ - return false; -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -6739,9 +6787,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - SCHED_WARN_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); - if (rq->cfs.h_nr_running > 1) { + if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; u64 slice = se->slice; s64 delta = slice - ran; @@ -6829,7 +6877,7 @@ static inline void check_update_overutilized_status(struct rq *rq) { } /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) { - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + return unlikely(rq->nr_running == rq->cfs.h_nr_idle && rq->nr_running); } @@ -6850,20 +6898,20 @@ requeue_delayed_entity(struct sched_entity *se) * Because a delayed entity is one that is still on * the runqueue competing until elegibility. */ - SCHED_WARN_ON(!se->sched_delayed); - SCHED_WARN_ON(!se->on_rq); + WARN_ON_ONCE(!se->sched_delayed); + WARN_ON_ONCE(!se->on_rq); if (sched_feat(DELAY_ZERO)) { update_entity_lag(cfs_rq, se); if (se->vlag > 0) { - cfs_rq->nr_running--; + cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; place_entity(cfs_rq, se, 0); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); - cfs_rq->nr_running++; + cfs_rq->nr_queued++; } } @@ -6881,10 +6929,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int idle_h_nr_running = task_has_idle_policy(p); - int h_nr_delayed = 0; + int h_nr_idle = task_has_idle_policy(p); + int h_nr_runnable = 1; int task_new = !(flags & ENQUEUE_WAKEUP); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; /* @@ -6909,8 +6957,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); - if (task_new) - h_nr_delayed = !!se->sched_delayed; + if (task_new && se->sched_delayed) + h_nr_runnable = 0; for_each_sched_entity(se) { if (se->on_rq) { @@ -6932,12 +6980,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) enqueue_entity(cfs_rq, se, flags); slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; - cfs_rq->h_nr_delayed += h_nr_delayed; + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -6954,21 +7002,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + 
min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; - cfs_rq->h_nr_delayed += h_nr_delayed; + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; } - if (!rq_h_nr_running && rq->cfs.h_nr_running) { + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { /* Account for idle runtime */ if (!rq->nr_running) dl_server_update_idle_time(rq, rq->curr); @@ -7015,22 +7065,22 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = NULL; - int idle_h_nr_running = 0; - int h_nr_running = 0; - int h_nr_delayed = 0; + int h_nr_idle = 0; + int h_nr_queued = 0; + int h_nr_runnable = 0; struct cfs_rq *cfs_rq; u64 slice = 0; if (entity_is_task(se)) { p = task_of(se); - h_nr_running = 1; - idle_h_nr_running = task_has_idle_policy(p); - if (!task_sleep && !task_delayed) - h_nr_delayed = !!se->sched_delayed; + h_nr_queued = 1; + h_nr_idle = task_has_idle_policy(p); + if (task_sleep || task_delayed || !se->sched_delayed) + h_nr_runnable = 1; } else { cfs_rq = group_cfs_rq(se); slice = cfs_rq_min_slice(cfs_rq); @@ -7046,12 +7096,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) break; } - cfs_rq->h_nr_running -= h_nr_running; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; - cfs_rq->h_nr_delayed -= h_nr_delayed; + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -7083,23 +7133,25 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) update_cfs_group(se); se->slice = slice; + if (se != cfs_rq->curr) + min_vruntime_cb_propagate(&se->run_node, NULL); slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running -= h_nr_running; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; - cfs_rq->h_nr_delayed -= h_nr_delayed; + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) return 0; } - sub_nr_running(rq, h_nr_running); + sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_running && !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); /* balance early to pull high priority tasks */ @@ -7107,8 +7159,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) rq->next_balance = jiffies; if (p && task_delayed) { - SCHED_WARN_ON(!task_sleep); - SCHED_WARN_ON(p->on_rq != 1); + WARN_ON_ONCE(!task_sleep); + WARN_ON_ONCE(p->on_rq != 1); /* Fix-up what dequeue_task_fair() skipped */ hrtick_update(rq); @@ -8686,7 +8738,7 @@ static inline void set_task_max_allowed_capacity(struct task_struct *p) {} 
static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) + if (WARN_ON_ONCE(!se->on_rq)) return; if (se_is_idle(se)) return; @@ -8746,8 +8798,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ - if (cse_is_idle && !pse_is_idle) + if (cse_is_idle && !pse_is_idle) { + /* + * When non-idle entity preempt an idle entity, + * don't give idle entity slice protection. + */ + cancel_protect_slice(se); goto preempt; + } + if (cse_is_idle != pse_is_idle) return; @@ -8766,8 +8825,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * Note that even if @p does not turn out to be the most eligible * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) - se->vlag = se->deadline + 1; + if (do_preempt_short(cfs_rq, pse, se)) + cancel_protect_slice(se); /* * If @p has become the most eligible task, force preemption. @@ -8788,7 +8847,7 @@ static struct task_struct *pick_task_fair(struct rq *rq) again: cfs_rq = &rq->cfs; - if (!cfs_rq->nr_running) + if (!cfs_rq->nr_queued) return NULL; do { @@ -8905,7 +8964,7 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) { - return !!dl_se->rq->cfs.nr_running; + return !!dl_se->rq->cfs.nr_queued; } static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) @@ -9236,43 +9295,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns 1, if task migration degrades locality - * Returns 0, if task migration improves locality i.e migration preferred. - * Returns -1, if task migration is not affected by locality. + * Returns a positive value, if task migration degrades locality. + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. */ -static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) - return -1; + return 0; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return -1; + return 0; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return -1; + return 0; /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) { if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) return 1; else - return -1; + return 0; } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return 0; + return -1; /* Leaving a core idle is often worse than degrading locality. 
*/ if (env->idle == CPU_IDLE) - return -1; + return 0; dist = node_distance(src_nid, dst_nid); if (numa_group) { @@ -9283,37 +9342,77 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) dst_weight = task_weight(p, dst_nid, dist); } - return dst_weight < src_weight; + return src_weight - dst_weight; } #else -static inline int migrate_degrades_locality(struct task_struct *p, +static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return -1; + return 0; } #endif /* + * Check whether the task is ineligible on the destination cpu + * + * When the PLACE_LAG scheduling feature is enabled and + * dst_cfs_rq->nr_queued is greater than 1, if the task + * is ineligible, it will also be ineligible when + * it is migrated to the destination cpu. + */ +static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu) +{ + struct cfs_rq *dst_cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; +#else + dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; +#endif + if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued && + !entity_eligible(task_cfs_rq(p), &p->se)) + return 1; + + return 0; +} + +/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot; + long degrades, hot; lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) + p->sched_task_hot = 0; /* * We do not migrate tasks that are: - * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. + * 1) delayed dequeued unless we migrate load, or + * 2) throttled_lb_pair, or + * 3) cannot be migrated to this CPU due to cpus_ptr, or + * 4) running (obviously), or + * 5) are cache-hot on their current CPU. */ + if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) + return 0; + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; + /* + * We want to prioritize the migration of eligible tasks. + * For ineligible tasks we soft-limit them and only allow + * them to migrate when nr_balance_failed is non-zero to + * avoid load-balancing trying very hard to balance the load. + */ + if (!env->sd->nr_balance_failed && + task_is_ineligible_on_dst_cpu(p, env->dst_cpu)) + return 0; + /* Disregard percpu kthreads; they are where they need to be. 
*/ if (kthread_is_per_cpu(p)) return 0; @@ -9340,12 +9439,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; /* Prevent to re-select dst_cpu via env's CPUs: */ - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { - env->flags |= LBF_DST_PINNED; - env->new_dst_cpu = cpu; - break; - } + cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr); + + if (cpu < nr_cpu_ids) { + env->flags |= LBF_DST_PINNED; + env->new_dst_cpu = cpu; } return 0; @@ -9369,16 +9467,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (env->flags & LBF_ACTIVE_LB) return 1; - tsk_cache_hot = migrate_degrades_locality(p, env); - if (tsk_cache_hot == -1) - tsk_cache_hot = task_hot(p, env); + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); + else + hot = degrades > 0; - if (tsk_cache_hot <= 0 || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot == 1) { - schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->stats.nr_forced_migrations); - } + if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + if (hot) + p->sched_task_hot = 1; return 1; } @@ -9393,6 +9490,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) { + p->sched_task_hot = 0; + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->stats.nr_forced_migrations); + } + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -9553,6 +9656,9 @@ static int detach_tasks(struct lb_env *env) continue; next: + if (p->sched_task_hot) + schedstat_inc(p->stats.nr_failed_migrations_hot); + list_move(&p->se.group_node, tasks); } @@ -9695,7 +9801,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); if (cfs_rq == &rq->cfs) @@ -10227,7 +10333,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * When there is more than 1 task, the group_overloaded case already * takes care of cpu with reduced capacity */ - if (rq->cfs.h_nr_running != 1) + if (rq->cfs.h_nr_runnable != 1) return false; return check_cpu_capacity(rq, sd); @@ -10249,7 +10355,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *sg_overloaded, bool *sg_overutilized) { - int i, nr_running, local_group; + int i, nr_running, local_group, sd_flags = env->sd->flags; + bool balancing_at_rd = !env->sd->parent; memset(sgs, 0, sizeof(*sgs)); @@ -10262,21 +10369,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable; nr_running = rq->nr_running; sgs->sum_nr_running += nr_running; - if (nr_running > 1) - *sg_overloaded = 1; - if (cpu_overutilized(i)) *sg_overutilized = 1; -#ifdef CONFIG_NUMA_BALANCING - sgs->nr_numa_running += rq->nr_numa_running; - sgs->nr_preferred_running += rq->nr_preferred_running; -#endif /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -10286,10 +10386,21 @@ static inline void update_sg_lb_stats(struct lb_env *env, continue; } + /* Overload indicator is only updated at root domain */ + if (balancing_at_rd && nr_running > 1) 
+ *sg_overloaded = 1; + +#ifdef CONFIG_NUMA_BALANCING + /* Only fbq_classify_group() uses this to classify NUMA groups */ + if (sd_flags & SD_NUMA) { + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; + } +#endif if (local_group) continue; - if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + if (sd_flags & SD_ASYM_CPUCAPACITY) { /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; @@ -10577,7 +10688,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_util += cpu_util_without(i, p); sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local; nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; @@ -11359,7 +11470,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, if (rt > env->fbq_type) continue; - nr_running = rq->cfs.h_nr_running; + nr_running = rq->cfs.h_nr_runnable; if (!nr_running) continue; @@ -11518,7 +11629,7 @@ static int need_active_balance(struct lb_env *env) * available on dst_cpu. */ if (env->idle && - (env->src_rq->cfs.h_nr_running == 1)) { + (env->src_rq->cfs.h_nr_runnable == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1; @@ -11598,6 +11709,28 @@ static int should_we_balance(struct lb_env *env) return group_balance_cpu(sg) == env->dst_cpu; } +static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd, + enum cpu_idle_type idle) +{ + if (!schedstat_enabled()) + return; + + switch (env->migration_type) { + case migrate_load: + __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance); + break; + case migrate_util: + __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance); + break; + case migrate_task: + __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance); + break; + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; + } +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -11648,7 +11781,7 @@ redo: WARN_ON_ONCE(busiest == env.dst_rq); - schedstat_add(sd->lb_imbalance[idle], env.imbalance); + update_lb_imbalance_stat(&env, sd, idle); env.src_cpu = busiest->cpu; env.src_rq = busiest; @@ -12146,16 +12279,13 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set - * anywhere yet. */ static inline int find_new_ilb(void) { const struct cpumask *hk_mask; int ilb_cpu; - hk_mask = housekeeping_cpumask(HK_TYPE_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { @@ -12173,7 +12303,8 @@ static inline int find_new_ilb(void) * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU * SMP function call (IPI). * - * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set + * (if there is one). 
*/ static void kick_ilb(unsigned int flags) { @@ -12261,7 +12392,7 @@ static void nohz_balancer_kick(struct rq *rq) * If there's a runnable CFS task and the current CPU has reduced * capacity, kick the ILB to see if there's a better CPU to run on: */ - if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { + if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12351,7 +12482,7 @@ unlock: void nohz_balance_exit_idle(struct rq *rq) { - SCHED_WARN_ON(rq != this_rq()); + WARN_ON_ONCE(rq != this_rq()); if (likely(!rq->nohz_tick_stopped)) return; @@ -12387,16 +12518,12 @@ void nohz_balance_enter_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - SCHED_WARN_ON(cpu != smp_processor_id()); + WARN_ON_ONCE(cpu != smp_processor_id()); /* If this CPU is going down, then nothing needs to be done: */ if (!cpu_active(cpu)) return; - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) - return; - /* * Can be set safely without rq->lock held * If a clear happens, it will have evaluated last additions because @@ -12474,7 +12601,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) int balance_cpu; struct rq *rq; - SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); + WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); /* * We assume there will be no idle load after this update and clear @@ -12616,13 +12743,6 @@ static void nohz_newidle_balance(struct rq *this_rq) { int this_cpu = this_rq->cpu; - /* - * This CPU doesn't want to be disturbed by scheduler - * housekeeping - */ - if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) - return; - /* Will wake up very soon. No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) return; @@ -12759,11 +12879,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) * have been enqueued in the meantime. Since we're not going idle, * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) + if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) + if (this_rq->nr_running != this_rq->cfs.h_nr_queued) pulled_task = -1; out: @@ -12784,9 +12904,9 @@ out: /* * This softirq handler is triggered via SCHED_SOFTIRQ from two places: * - * - directly from the local scheduler_tick() for periodic load balancing + * - directly from the local sched_tick() for periodic load balancing * - * - indirectly from a remote scheduler_tick() for NOHZ idle balancing + * - indirectly from a remote sched_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ static __latent_entropy void sched_balance_softirq(void) @@ -12877,7 +12997,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check * if we need to give up the CPU. 
*/ - if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && + if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) resched_curr(rq); } @@ -12921,7 +13041,7 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, struct cfs_rq *cfs_rqb; s64 delta; - SCHED_WARN_ON(task_rq(b)->core != rq->core); + WARN_ON_ONCE(task_rq(b)->core != rq->core); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -13021,7 +13141,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->cfs.nr_running == 1) + if (rq->cfs.nr_queued == 1) return; /* @@ -13124,7 +13244,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) static void switched_to_fair(struct rq *rq, struct task_struct *p) { - SCHED_WARN_ON(p->se.sched_delayed); + WARN_ON_ONCE(p->se.sched_delayed); attach_task_cfs_rq(p); @@ -13159,7 +13279,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs if (!first) return; - SCHED_WARN_ON(se->sched_delayed); + WARN_ON_ONCE(se->sched_delayed); if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); @@ -13431,7 +13551,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg->se[i]; - struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; @@ -13442,16 +13562,8 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) goto next_cpu; - if (se->on_rq) { - parent_cfs_rq = cfs_rq_of(se); - if (cfs_rq_is_idle(grp_cfs_rq)) - parent_cfs_rq->idle_nr_running++; - else - parent_cfs_rq->idle_nr_running--; - } - - idle_task_delta = grp_cfs_rq->h_nr_running - - grp_cfs_rq->idle_h_nr_running; + idle_task_delta = grp_cfs_rq->h_nr_queued - + grp_cfs_rq->h_nr_idle; if (!cfs_rq_is_idle(grp_cfs_rq)) idle_task_delta *= -1; @@ -13461,7 +13573,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (!se->on_rq) break; - cfs_rq->idle_h_nr_running += idle_task_delta; + cfs_rq->h_nr_idle += idle_task_delta; /* Already accounted at parent level and above. */ if (cfs_rq_is_idle(cfs_rq)) @@ -13554,7 +13666,6 @@ DEFINE_SCHED_CLASS(fair) = { #endif }; -#ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq, *pos; @@ -13588,7 +13699,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ -#endif /* CONFIG_SCHED_DEBUG */ __init void init_sched_fair_class(void) { diff --git a/kernel/sched/features.h b/kernel/sched/features.h index a3d331dd2d8f..3c12d9f93331 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -32,6 +32,15 @@ SCHED_FEAT(PREEMPT_SHORT, true) SCHED_FEAT(NEXT_BUDDY, false) /* + * Allow completely ignoring cfs_rq->next; which can be set from various + * places: + * - NEXT_BUDDY (wakeup preemption) + * - yield_to_task() + * - cgroup dequeue / pick + */ +SCHED_FEAT(PICK_BUDDY, true) + +/* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. 
*/ diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 5891e715f00d..81bc8b329ef1 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -9,15 +9,9 @@ */ enum hk_flags { - HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), - HK_FLAG_RCU = BIT(HK_TYPE_RCU), - HK_FLAG_MISC = BIT(HK_TYPE_MISC), - HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), - HK_FLAG_TICK = BIT(HK_TYPE_TICK), HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), - HK_FLAG_WQ = BIT(HK_TYPE_WQ), HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), - HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), + HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE), }; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); @@ -97,7 +91,7 @@ void __init housekeeping_init(void) static_branch_enable(&housekeeping_overridden); - if (housekeeping.flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_KERNEL_NOISE) sched_tick_offload_init(); for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { @@ -121,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) unsigned int first_cpu; int err = 0; - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) { if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { pr_warn("Housekeeping: nohz unsupported." " Build with CONFIG_NO_HZ_FULL\n"); @@ -177,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) housekeeping_setup_type(type, housekeeping_staging); } - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) tick_nohz_full_setup(non_housekeeping_mask); housekeeping.flags |= flags; @@ -195,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned long flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | - HK_FLAG_MISC | HK_FLAG_KTHREAD; + flags = HK_FLAG_KERNEL_NOISE; return housekeeping_setup(str, flags); } @@ -210,9 +203,12 @@ static int __init housekeeping_isolcpus_setup(char *str) int len; while (isalpha(*str)) { + /* + * isolcpus=nohz is equivalent to nohz_full. 
+ */ if (!strncmp(str, "nohz,", 5)) { str += 5; - flags |= HK_FLAG_TICK; + flags |= HK_FLAG_KERNEL_NOISE; continue; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index fee75cc2c47b..7a8534a2deff 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = grq->h_nr_running + * se_runnable() = grq->h_nr_runnable * * runnable_sum = se_runnable() * runnable = grq->runnable_sum * runnable_avg = runnable_sum @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_running - cfs_rq->h_nr_delayed, + cfs_rq->h_nr_runnable, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 84dad1511d1e..bb56805e3d47 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -998,7 +998,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st s64 delta; u64 irq; - if (static_branch_likely(&psi_disabled)) + if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) return; if (!curr->pid) @@ -1240,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (!irqtime_enabled() && res == PSI_IRQ) + return -EOPNOTSUPP; +#endif + /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index bd66a46b06ac..fa03ec3ed56a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -26,7 +26,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff size_t *lenp, loff_t *ppos); static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -static struct ctl_table sched_rt_sysctls[] = { +static const struct ctl_table sched_rt_sysctls[] = { { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, @@ -127,9 +127,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) raw_spin_lock_init(&rt_b->rt_runtime_lock); - hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_HARD); - rt_b->rt_period_timer.function = sched_rt_period_timer; + hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD); } static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) @@ -169,9 +168,8 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_SCHED_DEBUG WARN_ON_ONCE(!rt_entity_is_task(rt_se)); -#endif + return container_of(rt_se, struct task_struct, rt); } @@ -1713,7 +1711,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; - if (SCHED_WARN_ON(list_empty(queue))) + if (WARN_ON_ONCE(list_empty(queue))) return NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); @@ -2910,6 +2908,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff int ret; mutex_lock(&mutex); + sched_domains_mutex_lock(); old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; @@ -2936,6 +2935,7 @@ undo: 
sysctl_sched_rt_period = old_period; sysctl_sched_rt_runtime = old_runtime; } + sched_domains_mutex_unlock(); mutex_unlock(&mutex); return ret; @@ -2967,7 +2967,6 @@ static int sched_rr_handler(const struct ctl_table *table, int write, void *buff } #endif /* CONFIG_SYSCTL */ -#ifdef CONFIG_SCHED_DEBUG void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; @@ -2978,4 +2977,3 @@ void print_rt_stats(struct seq_file *m, int cpu) print_rt_rq(m, cpu, rt_rq); rcu_read_unlock(); } -#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c5d67a43fe52..47972f34ea70 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -91,12 +91,6 @@ struct cpuidle_state; #include "cpupri.h" #include "cpudeadline.h" -#ifdef CONFIG_SCHED_DEBUG -# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -#else -# define SCHED_WARN_ON(x) ({ (void)(x), 0; }) -#endif - /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -362,7 +356,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int dl_bw_check_overflow(int cpu); +extern int dl_bw_deactivate(int cpu); extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following @@ -572,7 +566,7 @@ extern void sched_online_group(struct task_group *tg, extern void sched_destroy_group(struct task_group *tg); extern void sched_release_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); +extern void sched_move_task(struct task_struct *tsk, bool for_autogroup); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); @@ -650,11 +644,10 @@ struct balance_callback { /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned int nr_running; - unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int idle_nr_running; /* SCHED_IDLE */ - unsigned int idle_h_nr_running; /* SCHED_IDLE */ - unsigned int h_nr_delayed; + unsigned int nr_queued; + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; u64 avg_load; @@ -760,6 +753,7 @@ enum scx_rq_flags { SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ SCX_RQ_BYPASSING = 1 << 4, + SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_BALANCE = 1 << 17, @@ -772,9 +766,10 @@ struct scx_rq { unsigned long ops_qseq; u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; - u32 flags; u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ bool cpu_released; + u32 flags; + u64 clock; /* current per-rq clock -- see scx_bpf_now() */ cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; @@ -904,11 +899,8 @@ struct dl_rq { static inline void se_update_runnable(struct sched_entity *se) { - if (!entity_is_task(se)) { - struct cfs_rq *cfs_rq = se->my_q; - - se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed; - } + if (!entity_is_task(se)) + 
se->runnable_weight = se->my_q->h_nr_runnable; } static inline long se_runnable(struct sched_entity *se) @@ -1000,7 +992,7 @@ struct root_domain { * Also, some corner cases, like 'wrap around' is dangerous, but given * that u64 is 'big enough'. So that shouldn't be a concern. */ - u64 visit_gen; + u64 visit_cookie; #ifdef HAVE_RT_PUSH_IPI /* @@ -1182,10 +1174,8 @@ struct rq { atomic_t nr_iowait; -#ifdef CONFIG_SCHED_DEBUG u64 last_seen_need_resched_ns; int ticks_without_resched; -#endif #ifdef CONFIG_MEMBARRIER int membarrier_state; @@ -1573,7 +1563,7 @@ static inline void update_idle_core(struct rq *rq) { } static inline struct task_struct *task_of(struct sched_entity *se) { - SCHED_WARN_ON(!entity_is_task(se)); + WARN_ON_ONCE(!entity_is_task(se)); return container_of(se, struct task_struct, se); } @@ -1654,7 +1644,7 @@ static inline void assert_clock_updated(struct rq *rq) * The only reason for not seeing a clock update since the * last rq_pin_lock() is if we're currently skipping updates. */ - SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP); } static inline u64 rq_clock(struct rq *rq) @@ -1701,7 +1691,7 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq) static inline void rq_clock_start_loop_update(struct rq *rq) { lockdep_assert_rq_held(rq); - SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP); + WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP); rq->clock_update_flags |= RQCF_ACT_SKIP; } @@ -1714,18 +1704,48 @@ static inline void rq_clock_stop_loop_update(struct rq *rq) struct rq_flags { unsigned long flags; struct pin_cookie cookie; -#ifdef CONFIG_SCHED_DEBUG /* * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the * current pin context is stashed here in case it needs to be * restored in rq_repin_lock(). */ unsigned int clock_update_flags; -#endif }; extern struct balance_callback balance_push_callback; +#ifdef CONFIG_SCHED_CLASS_EXT +extern const struct sched_class ext_sched_class; + +DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */ +DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ + +#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) +#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.clock, clock); + smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID); +} + +static inline void scx_rq_clock_invalidate(struct rq *rq) +{ + if (!scx_enabled()) + return; + WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); +} + +#else /* !CONFIG_SCHED_CLASS_EXT */ +#define scx_enabled() false +#define scx_switched_all() false + +static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {} +static inline void scx_rq_clock_invalidate(struct rq *rq) {} +#endif /* !CONFIG_SCHED_CLASS_EXT */ + /* * Lockdep annotation that avoids accidental unlocks; it's like a * sticky/continuous lockdep_assert_held(). 
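/*
 * Editor's illustrative sketch, not part of the patch above: one way a
 * consumer could read the per-rq clock that scx_rq_clock_update() publishes.
 * The smp_load_acquire() on rq->scx.flags pairs with the smp_store_release()
 * in scx_rq_clock_update(), so a reader that observes SCX_RQ_CLK_VALID also
 * observes the rq->scx.clock value stored before the flag was set.
 * scx_rq_clock_peek() and the @fallback argument are invented for this
 * example; the real consumer is scx_bpf_now(), whose implementation is not
 * shown in this diff.
 */
static inline u64 scx_rq_clock_peek(struct rq *rq, u64 fallback)
{
	u32 flags = smp_load_acquire(&rq->scx.flags);

	/* Cleared by rq_unpin_lock() via scx_rq_clock_invalidate(). */
	if (!(flags & SCX_RQ_CLK_VALID))
		return fallback;

	return READ_ONCE(rq->scx.clock);
}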
@@ -1740,22 +1760,19 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) { rf->cookie = lockdep_pin_lock(__rq_lockp(rq)); -#ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -# ifdef CONFIG_SMP - SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); -# endif +#ifdef CONFIG_SMP + WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); #endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SCHED_DEBUG if (rq->clock_update_flags > RQCF_ACT_SKIP) rf->clock_update_flags = RQCF_UPDATED; -#endif + scx_rq_clock_invalidate(rq); lockdep_unpin_lock(__rq_lockp(rq), rf->cookie); } @@ -1763,12 +1780,10 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) { lockdep_repin_lock(__rq_lockp(rq), rf->cookie); -#ifdef CONFIG_SCHED_DEBUG /* * Restore the value we stashed in @rf for this pin context. */ rq->clock_update_flags |= rf->clock_update_flags; -#endif } extern @@ -2042,9 +2057,7 @@ struct sched_group_capacity { unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ -#ifdef CONFIG_SCHED_DEBUG int id; -#endif unsigned long cpumask[]; /* Balance mask */ }; @@ -2084,13 +2097,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) extern int group_balance_cpu(struct sched_group *sg); -#ifdef CONFIG_SCHED_DEBUG extern void update_sched_domain_debugfs(void); extern void dirty_sched_domain_sysctl(int cpu); -#else -static inline void update_sched_domain_debugfs(void) { } -static inline void dirty_sched_domain_sysctl(int cpu) { } -#endif extern int sched_update_scaling(void); @@ -2170,13 +2178,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) } /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + * Tunables: */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug const -#endif #define SCHED_FEAT(name, enabled) \ __SCHED_FEAT_##name , @@ -2188,13 +2191,11 @@ enum { #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG - /* * To support run-time toggling of sched features, all the translation units * (but core.c) reference the sysctl_sched_features defined in core.c. */ -extern const_debug unsigned int sysctl_sched_features; +extern __read_mostly unsigned int sysctl_sched_features; #ifdef CONFIG_JUMP_LABEL @@ -2216,24 +2217,6 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* !CONFIG_JUMP_LABEL */ -#else /* !SCHED_DEBUG: */ - -/* - * Each translation unit has its own copy of sysctl_sched_features to allow - * constants propagation at compile time and compiler optimization based on - * features default. 
- */ -#define SCHED_FEAT(name, enabled) \ - (1UL << __SCHED_FEAT_##name) * enabled | -static const_debug __maybe_unused unsigned int sysctl_sched_features = -#include "features.h" - 0; -#undef SCHED_FEAT - -#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) - -#endif /* !SCHED_DEBUG */ - extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; @@ -2280,7 +2263,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p) static inline int task_on_rq_queued(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_QUEUED; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED; } static inline int task_on_rq_migrating(struct task_struct *p) @@ -2514,19 +2497,6 @@ extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; -#ifdef CONFIG_SCHED_CLASS_EXT -extern const struct sched_class ext_sched_class; - -DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */ -DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */ - -#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) -#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) -#else /* !CONFIG_SCHED_CLASS_EXT */ -#define scx_enabled() false -#define scx_switched_all() false -#endif /* !CONFIG_SCHED_CLASS_EXT */ - /* * Iterate only active classes. SCX can take over all fair tasks or be * completely disabled. If the former, skip fair. If the latter, skip SCX. @@ -2574,7 +2544,7 @@ static inline bool sched_rt_runnable(struct rq *rq) static inline bool sched_fair_runnable(struct rq *rq) { - return rq->cfs.nr_running > 0; + return rq->cfs.nr_queued > 0; } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); @@ -2668,7 +2638,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - SCHED_WARN_ON(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held()); return rq->idle_state; } @@ -2826,12 +2796,11 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); # define SCHED_NR_MIGRATE_BREAK 32 #endif -extern const_debug unsigned int sysctl_sched_nr_migrate; -extern const_debug unsigned int sysctl_sched_migration_cost; +extern __read_mostly unsigned int sysctl_sched_nr_migrate; +extern __read_mostly unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; -#ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; @@ -2842,7 +2811,6 @@ extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; -#endif #ifdef CONFIG_SCHED_HRTICK @@ -2915,7 +2883,6 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif -#ifdef CONFIG_SCHED_DEBUG /* * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to * acquire rq lock instead of rq_lock(). So at the end of these two functions @@ -2930,9 +2897,6 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); #endif } -#else -static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { } -#endif #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) 
\ __DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ @@ -3145,7 +3109,6 @@ extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); -#ifdef CONFIG_SCHED_DEBUG extern bool sched_debug_verbose; extern void print_cfs_stats(struct seq_file *m, int cpu); @@ -3156,15 +3119,12 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); -# ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void print_numa_stats(struct seq_file *m, int node, unsigned long tsf, unsigned long tpf, unsigned long gsf, unsigned long gpf); -# endif /* CONFIG_NUMA_BALANCING */ -#else /* !CONFIG_SCHED_DEBUG: */ -static inline void resched_latency_warn(int cpu, u64 latency) { } -#endif /* !CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_NUMA_BALANCING */ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); @@ -3242,6 +3202,12 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +extern int sched_clock_irqtime; + +static inline int irqtime_enabled(void) +{ + return sched_clock_irqtime; +} /* * Returns the irqtime minus the softirq time computed by ksoftirqd. @@ -3262,6 +3228,13 @@ static inline u64 irq_time_read(int cpu) return total; } +#else + +static inline int irqtime_enabled(void) +{ + return 0; +} + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -3364,6 +3337,31 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; } unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); +/* + * When uclamp is compiled in, the aggregation at rq level is 'turned off' + * by default in the fast path and only gets turned on once userspace performs + * an operation that requires it. + * + * Returns true if userspace opted-in to use uclamp and aggregation at rq level + * hence is active. + */ +static inline bool uclamp_is_used(void) +{ + return static_branch_likely(&sched_uclamp_used); +} + +/* + * Enabling static branches would get the cpus_read_lock(), + * check whether uclamp_is_used before enable it to avoid always + * calling cpus_read_lock(). Because we never disable this + * static key once enable it. + */ +static inline void sched_uclamp_enable(void) +{ + if (!uclamp_is_used()) + static_branch_enable(&sched_uclamp_used); +} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3387,7 +3385,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) unsigned long rq_util; unsigned long max_util; - if (!static_branch_likely(&sched_uclamp_used)) + if (!uclamp_is_used()) return false; rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq); @@ -3396,19 +3394,6 @@ static inline bool uclamp_rq_is_capped(struct rq *rq) return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util; } -/* - * When uclamp is compiled in, the aggregation at rq level is 'turned off' - * by default in the fast path and only gets turned on once userspace performs - * an operation that requires it. - * - * Returns true if userspace opted-in to use uclamp and aggregation at rq level - * hence is active. 
- */ -static inline bool uclamp_is_used(void) -{ - return static_branch_likely(&sched_uclamp_used); -} - #define for_each_clamp_id(clamp_id) \ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) @@ -3456,6 +3441,8 @@ static inline bool uclamp_is_used(void) return false; } +static inline void sched_uclamp_enable(void) {} + static inline unsigned long uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id) { @@ -3509,6 +3496,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */ +extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr); + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) @@ -3587,6 +3576,7 @@ extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +extern const char *preempt_modes[]; #ifdef CONFIG_SCHED_MM_CID @@ -3666,10 +3656,28 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { struct cpumask *cidmask = mm_cidmask(mm); struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid = __this_cpu_read(pcpu_cid->recent_cid); + int cid, max_nr_cid, allowed_max_nr_cid; + /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } /* Try to re-use recent cid. This improves cache locality. */ - if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) return cid; /* * Expand cid allocation if the maximum number of concurrency @@ -3677,8 +3685,9 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) * and number of threads. Expanding cid allocation as much as * possible improves cache locality. */ - cid = atomic_read(&mm->max_nr_cid); + cid = max_nr_cid; while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. 
*/ if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) continue; if (!cpumask_test_and_set_cpu(cid, cidmask)) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index eb0cdcd4d921..4346fd81c31f 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -103,7 +103,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 16 +#define SCHEDSTAT_VERSION 17 static int show_schedstat(struct seq_file *seq, void *v) { @@ -138,14 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - seq_printf(seq, "domain%d %*pb", dcount++, + seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name, cpumask_pr_args(sched_domain_span(sd))); for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %u %u %u %u %u %u %u %u", + seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u", sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], - sd->lb_imbalance[itype], + sd->lb_imbalance_load[itype], + sd->lb_imbalance_util[itype], + sd->lb_imbalance_task[itype], + sd->lb_imbalance_misfit[itype], sd->lb_gained[itype], sd->lb_hot_gained[itype], sd->lb_nobusyq[itype], diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8ee0add5a48a..452826df6ae1 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -138,9 +138,13 @@ static inline void psi_enqueue(struct task_struct *p, int flags) if (flags & ENQUEUE_RESTORE) return; + /* psi_sched_switch() will handle the flags */ + if (task_on_cpu(task_rq(p), p)) + return; + if (p->se.sched_delayed) { /* CPU migration of "sleeping" task */ - SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); + WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED)); if (p->in_memstall) set |= TSK_MEMSTALL; if (p->in_iowait) @@ -244,7 +248,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) delta = rq_clock(rq) - t->sched_info.last_queued; t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; - + if (delta > t->sched_info.max_run_delay) + t->sched_info.max_run_delay = delta; + if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) + t->sched_info.min_run_delay = delta; rq_sched_info_dequeue(rq, delta); } @@ -266,6 +273,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; + if (delta > t->sched_info.max_run_delay) + t->sched_info.max_run_delay = delta; + if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) + t->sched_info.min_run_delay = delta; rq_sched_info_arrive(rq, delta); } diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index ff0e5ab4e37c..c326de1344fb 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -300,20 +300,10 @@ static void __setscheduler_params(struct task_struct *p, p->policy = policy; - if (dl_policy(policy)) { + if (dl_policy(policy)) __setparam_dl(p, attr); - } else if (fair_policy(policy)) { - p->static_prio = NICE_TO_PRIO(attr->sched_nice); - if (attr->sched_runtime) { - p->se.custom_slice = 1; - p->se.slice = clamp_t(u64, attr->sched_runtime, - NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ - NSEC_PER_MSEC*100); /* HZ=100 / 10 */ - } else { - p->se.custom_slice = 0; - p->se.slice = sysctl_sched_base_slice; - } - } + else if (fair_policy(policy)) + __setparam_fair(p, attr); /* 
rt-policy tasks do not have a timerslack */ if (rt_or_dl_task_policy(p)) { @@ -378,7 +368,7 @@ static int uclamp_validate(struct task_struct *p, * blocking operation which obviously cannot be done while holding * scheduler locks. */ - static_branch_enable(&sched_uclamp_used); + sched_uclamp_enable(); return 0; } @@ -885,7 +875,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; @@ -994,7 +984,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, struct sched_attr attr; int retval; - if (!uattr || pid < 0 || flags) + if (unlikely(!uattr || pid < 0 || flags)) return -EINVAL; retval = sched_copy_attr(uattr, &attr); @@ -1059,7 +1049,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) struct task_struct *p; int retval; - if (!param || pid < 0) + if (unlikely(!param || pid < 0)) return -EINVAL; scoped_guard (rcu) { @@ -1095,8 +1085,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, struct task_struct *p; int retval; - if (!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags) + if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || + usize < SCHED_ATTR_SIZE_VER0 || flags)) return -EINVAL; scoped_guard (rcu) { @@ -1140,6 +1130,13 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) return 0; /* + * The special/sugov task isn't part of regular bandwidth/admission + * control so let userspace change affinities. + */ + if (dl_entity_is_special(&p->dl)) + return 0; + + /* * Since bandwidth control happens on root_domain basis, * if admission test is enabled, we only admit -deadline * tasks allowed to run on all the CPUs in the task's @@ -1433,7 +1430,7 @@ int __sched yield_to(struct task_struct *p, bool preempt) struct rq *rq, *p_rq; int yielded = 0; - scoped_guard (irqsave) { + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { rq = this_rq(); again: diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9748a4c8d668..f1ebc60d967f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -6,13 +6,19 @@ #include <linux/bsearch.h> DEFINE_MUTEX(sched_domains_mutex); +void sched_domains_mutex_lock(void) +{ + mutex_lock(&sched_domains_mutex); +} +void sched_domains_mutex_unlock(void) +{ + mutex_unlock(&sched_domains_mutex); +} /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; -#ifdef CONFIG_SCHED_DEBUG - static int __init sched_debug_setup(char *str) { sched_debug_verbose = true; @@ -151,15 +157,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } } -#else /* !CONFIG_SCHED_DEBUG */ - -# define sched_debug_verbose 0 -# define sched_domain_debug(sd, cpu) do { } while (0) -static inline bool sched_debug(void) -{ - return false; -} -#endif /* CONFIG_SCHED_DEBUG */ /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | @@ -312,7 +309,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table sched_energy_aware_sysctls[] = { +static const struct ctl_table sched_energy_aware_sysctls[] = { { .procname = "sched_energy_aware", .data = &sysctl_sched_energy_aware, @@ -560,7 
+557,7 @@ static int init_rootdomain(struct root_domain *rd) rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif - rd->visit_gen = 0; + rd->visit_cookie = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; @@ -1635,9 +1632,7 @@ sd_init(struct sched_domain_topology_level *tl, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -#ifdef CONFIG_SCHED_DEBUG .name = tl->name, -#endif }; sd_span = sched_domain_span(sd); @@ -2277,9 +2272,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sgc) return -ENOMEM; -#ifdef CONFIG_SCHED_DEBUG sgc->id = j; -#endif *per_cpu_ptr(sdd->sgc, j) = sgc; } @@ -2338,10 +2331,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); -#endif /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), @@ -2684,7 +2675,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * Call with hotplug lock and sched_domains_mutex held */ -void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], +static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { bool __maybe_unused has_eas = false; @@ -2716,19 +2707,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_cur[i], doms_new[j]) && - dattrs_equal(dattr_cur, i, dattr_new, j)) { - struct root_domain *rd; - - /* - * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be cleared. It - * will be recomputed in function - * update_tasks_root_domain(). - */ - rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; - dl_clear_root_domain(rd); + dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; - } } /* No match - a current sched domain not in new doms_new[] */ detach_destroy_domains(doms_cur[i]); @@ -2785,6 +2765,7 @@ match3: ndoms_cur = ndoms_new; update_sched_domain_debugfs(); + dl_rebuild_rd_accounting(); } /* @@ -2793,7 +2774,7 @@ match3: void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { - mutex_lock(&sched_domains_mutex); + sched_domains_mutex_lock(); partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); + sched_domains_mutex_unlock(); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 385d48293a5f..41aa761c7738 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -29,13 +29,11 @@ #include <linux/syscalls.h> #include <linux/sysctl.h> +#include <asm/syscall.h> + /* Not exposed in headers: strictly internal use only. */ #define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) -#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER -#include <asm/syscall.h> -#endif - #ifdef CONFIG_SECCOMP_FILTER #include <linux/file.h> #include <linux/filter.h> @@ -576,6 +574,9 @@ void seccomp_filter_release(struct task_struct *tsk) if (WARN_ON((tsk->flags & PF_EXITING) == 0)) return; + if (READ_ONCE(tsk->seccomp.filter) == NULL) + return; + spin_lock_irq(&tsk->sighand->siglock); orig = tsk->seccomp.filter; /* Detach task from its filter tree. 
*/ @@ -601,6 +602,13 @@ static inline void seccomp_sync_threads(unsigned long flags) BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); assert_spin_locked(¤t->sighand->siglock); + /* + * Don't touch any of the threads if the process is being killed. + * This allows for a lockless check in seccomp_filter_release. + */ + if (current->signal->flags & SIGNAL_GROUP_EXIT) + return; + /* Synchronize all threads. */ caller = current; for_each_thread(caller, thread) { @@ -749,6 +757,15 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, if (WARN_ON_ONCE(!fprog)) return false; + /* Our single exception to filtering. */ +#ifdef __NR_uretprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + if (sd->nr == __NR_uretprobe) + return true; +#endif + for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; u16 code = insn->code; @@ -1023,6 +1040,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, +#ifdef __NR_uretprobe + __NR_uretprobe, +#endif -1, /* negative terminated */ }; @@ -1062,6 +1082,13 @@ void secure_computing_strict(int this_syscall) else BUG(); } +int __secure_computing(void) +{ + int this_syscall = syscall_get_nr(current, current_pt_regs()); + + secure_computing_strict(this_syscall); + return 0; +} #else #ifdef CONFIG_SECCOMP_FILTER @@ -1213,13 +1240,12 @@ out: return -1; } -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { u32 filter_ret, action; + struct seccomp_data sd; struct seccomp_filter *match = NULL; int data; - struct seccomp_data sd_local; /* * Make sure that any changes to mode from another thread have @@ -1227,12 +1253,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, */ smp_rmb(); - if (!sd) { - populate_seccomp_data(&sd_local); - sd = &sd_local; - } + populate_seccomp_data(&sd); - filter_ret = seccomp_run_filters(sd, &match); + filter_ret = seccomp_run_filters(&sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; @@ -1290,13 +1313,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, * a reload of all registers. This does not goto skip since * a skip would have already been reported. */ - if (__seccomp_filter(this_syscall, NULL, true)) + if (__seccomp_filter(this_syscall, true)) return -1; return 0; case SECCOMP_RET_USER_NOTIF: - if (seccomp_do_user_notification(this_syscall, match, sd)) + if (seccomp_do_user_notification(this_syscall, match, &sd)) goto skip; return 0; @@ -1338,8 +1361,7 @@ skip: return -1; } #else -static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, - const bool recheck_after_trace) +static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { BUG(); @@ -1347,7 +1369,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, } #endif -int __secure_computing(const struct seccomp_data *sd) +int __secure_computing(void) { int mode = current->seccomp.mode; int this_syscall; @@ -1356,15 +1378,14 @@ int __secure_computing(const struct seccomp_data *sd) unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return 0; - this_syscall = sd ? 
sd->nr : - syscall_get_nr(current, current_pt_regs()); + this_syscall = syscall_get_nr(current, current_pt_regs()); switch (mode) { case SECCOMP_MODE_STRICT: __secure_computing_strict(this_syscall); /* may call do_exit */ return 0; case SECCOMP_MODE_FILTER: - return __seccomp_filter(this_syscall, sd, false); + return __seccomp_filter(this_syscall, false); /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */ case SECCOMP_MODE_DEAD: WARN_ON_ONCE(1); @@ -2450,7 +2471,7 @@ static int seccomp_actions_logged_handler(const struct ctl_table *ro_table, int return ret; } -static struct ctl_table seccomp_sysctl_table[] = { +static const struct ctl_table seccomp_sysctl_table[] = { { .procname = "actions_avail", .data = (void *) &seccomp_actions_avail, diff --git a/kernel/signal.c b/kernel/signal.c index a2afd54303f0..86ba66d95da5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2092,7 +2092,7 @@ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueu * from a non-periodic timer, then just drop the reference * count. Otherwise queue it on the ignored list. */ - if (tmr->it_signal && tmr->it_sig_periodic) + if (posixtimer_valid(tmr) && tmr->it_sig_periodic) hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); else posixtimer_putref(tmr); @@ -2180,8 +2180,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* - * tsk is a group leader and has no threads, wake up the - * non-PIDFD_THREAD waiters. + * Notify for thread-group leaders without subthreads. */ if (thread_group_empty(tsk)) do_notify_pidfd(tsk); @@ -4009,6 +4008,47 @@ static struct pid *pidfd_to_pid(const struct file *file) (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ PIDFD_SIGNAL_PROCESS_GROUP) +static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type, + siginfo_t __user *info, unsigned int flags) +{ + kernel_siginfo_t kinfo; + + switch (flags) { + case PIDFD_SIGNAL_THREAD: + type = PIDTYPE_PID; + break; + case PIDFD_SIGNAL_THREAD_GROUP: + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } + + if (info) { + int ret; + + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + return ret; + + if (unlikely(sig != kinfo.si_signo)) + return -EINVAL; + + /* Only allow sending arbitrary signals to yourself. */ + if ((task_pid(current) != pid || type > PIDTYPE_TGID) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) + return -EPERM; + } else { + prepare_kill_siginfo(sig, &kinfo, type); + } + + if (type == PIDTYPE_PGID) + return kill_pgrp_info(sig, &kinfo, pid); + + return kill_pid_info_type(sig, &kinfo, pid, type); +} + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -4026,9 +4066,7 @@ static struct pid *pidfd_to_pid(const struct file *file) SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { - int ret; struct pid *pid; - kernel_siginfo_t kinfo; enum pid_type type; /* Enforce flags be set to 0 until we add an extension. 
*/ @@ -4039,57 +4077,39 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; - CLASS(fd, f)(pidfd); - if (fd_empty(f)) - return -EBADF; + switch (pidfd) { + case PIDFD_SELF_THREAD: + pid = get_task_pid(current, PIDTYPE_PID); + type = PIDTYPE_PID; + break; + case PIDFD_SELF_THREAD_GROUP: + pid = get_task_pid(current, PIDTYPE_TGID); + type = PIDTYPE_TGID; + break; + default: { + CLASS(fd, f)(pidfd); + if (fd_empty(f)) + return -EBADF; - /* Is this a pidfd? */ - pid = pidfd_to_pid(fd_file(f)); - if (IS_ERR(pid)) - return PTR_ERR(pid); + /* Is this a pidfd? */ + pid = pidfd_to_pid(fd_file(f)); + if (IS_ERR(pid)) + return PTR_ERR(pid); - if (!access_pidfd_pidns(pid)) - return -EINVAL; + if (!access_pidfd_pidns(pid)) + return -EINVAL; - switch (flags) { - case 0: /* Infer scope from the type of pidfd. */ if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_THREAD: - type = PIDTYPE_PID; - break; - case PIDFD_SIGNAL_THREAD_GROUP: - type = PIDTYPE_TGID; - break; - case PIDFD_SIGNAL_PROCESS_GROUP: - type = PIDTYPE_PGID; - break; - } - if (info) { - ret = copy_siginfo_from_user_any(&kinfo, info); - if (unlikely(ret)) - return ret; - - if (unlikely(sig != kinfo.si_signo)) - return -EINVAL; - - /* Only allow sending arbitrary signals to yourself. */ - if ((task_pid(current) != pid || type > PIDTYPE_TGID) && - (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) - return -EPERM; - } else { - prepare_kill_siginfo(sig, &kinfo, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); + } } - if (type == PIDTYPE_PGID) - return kill_pgrp_info(sig, &kinfo, pid); - else - return kill_pid_info_type(sig, &kinfo, pid, type); + return do_pidfd_send_signal(pid, sig, type, info, flags); } static int @@ -4950,7 +4970,7 @@ static inline void siginfo_buildtime_checks(void) } #if defined(CONFIG_SYSCTL) -static struct ctl_table signal_debug_table[] = { +static const struct ctl_table signal_debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", diff --git a/kernel/smp.c b/kernel/smp.c index 27dc31a146a3..974f3a3962e8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -170,9 +170,9 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ -module_param(csd_lock_timeout, ulong, 0444); +module_param(csd_lock_timeout, ulong, 0644); static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ -module_param(panic_on_ipistall, int, 0444); +module_param(panic_on_ipistall, int, 0644); static atomic_t csd_bug_count = ATOMIC_INIT(0); @@ -815,7 +815,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, WARN_ON_ONCE(!in_task()); /* Check if we need local execution. */ - if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask)) + if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && + (!cond_func || cond_func(this_cpu, info))) run_local = true; /* Check if we need remote execution, i.e., any CPU excluding this one. 
*/ @@ -868,7 +869,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, send_call_function_ipi_mask(cfd->cpumask_ipi); } - if (run_local && (!cond_func || cond_func(this_cpu, info))) { + if (run_local) { unsigned long flags; local_irq_save(flags); diff --git a/kernel/stackleak.c b/kernel/stackleak.c index 39fd620a7db6..bb65321761b4 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -15,6 +15,7 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include <linux/jump_label.h> +#include <linux/string_choices.h> #include <linux/sysctl.h> #include <linux/init.h> @@ -41,10 +42,10 @@ static int stack_erasing_sysctl(const struct ctl_table *table, int write, static_branch_enable(&stack_erasing_bypass); pr_warn("stackleak: kernel stack erasing is %s\n", - state ? "enabled" : "disabled"); + str_enabled_disabled(state)); return ret; } -static struct ctl_table stackleak_sysctls[] = { +static const struct ctl_table stackleak_sysctls[] = { { .procname = "stack_erasing", .data = NULL, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index da821ce258ea..5d2d0562115b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -250,8 +250,9 @@ static int multi_cpu_stop(void *data) * be detected and reported on their side. */ touch_nmi_watchdog(); + /* Also suppress RCU CPU stall warnings. */ + rcu_momentary_eqs(); } - rcu_momentary_eqs(); } while (curstate != MULTI_STOP_EXIT); local_irq_restore(flags); diff --git a/kernel/sys.c b/kernel/sys.c index c4c701c6f0b4..c434968e9f5d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -75,6 +75,8 @@ #include <asm/io.h> #include <asm/unistd.h> +#include <trace/events/task.h> + #include "uid16.h" #ifndef SET_UNALIGN_CTL @@ -1083,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { struct task_struct *p; struct task_struct *group_leader = current->group_leader; + struct pid *pids[PIDTYPE_MAX] = { 0 }; struct pid *pgrp; int err; @@ -1140,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) goto out; if (task_pgrp(p) != pgrp) - change_pid(p, PIDTYPE_PGID, pgrp); + change_pid(pids, p, PIDTYPE_PGID, pgrp); err = 0; out: /* All paths lead to here, thus we are safe. 
-DaveM */ write_unlock_irq(&tasklist_lock); rcu_read_unlock(); + free_pids(pids); return err; } @@ -1220,21 +1224,22 @@ out: return retval; } -static void set_special_pids(struct pid *pid) +static void set_special_pids(struct pid **pids, struct pid *pid) { struct task_struct *curr = current->group_leader; if (task_session(curr) != pid) - change_pid(curr, PIDTYPE_SID, pid); + change_pid(pids, curr, PIDTYPE_SID, pid); if (task_pgrp(curr) != pid) - change_pid(curr, PIDTYPE_PGID, pid); + change_pid(pids, curr, PIDTYPE_PGID, pid); } int ksys_setsid(void) { struct task_struct *group_leader = current->group_leader; struct pid *sid = task_pid(group_leader); + struct pid *pids[PIDTYPE_MAX] = { 0 }; pid_t session = pid_vnr(sid); int err = -EPERM; @@ -1250,13 +1255,14 @@ int ksys_setsid(void) goto out; group_leader->signal->leader = 1; - set_special_pids(sid); + set_special_pids(pids, sid); proc_clear_tty(group_leader); err = session; out: write_unlock_irq(&tasklist_lock); + free_pids(pids); if (err > 0) { proc_sid_connector(group_leader); sched_autogroup_create_attach(group_leader); @@ -2809,7 +2815,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; error = arch_lock_shadow_stack_status(me, arg2); break; + case PR_TIMER_CREATE_RESTORE_IDS: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = posixtimer_create_prctl(arg2); + break; default: + trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5); error = -EINVAL; break; } diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index 3ac98bb7fb82..eb2842bd0557 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -374,7 +374,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value( struct kunit *test) { unsigned char data = 0; - struct ctl_table table_foo[] = { + const struct ctl_table table_foo[] = { { .procname = "foo", .data = &data, @@ -386,7 +386,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value( }, }; - struct ctl_table table_bar[] = { + const struct ctl_table table_bar[] = { { .procname = "bar", .data = &data, @@ -398,7 +398,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value( }, }; - struct ctl_table table_qux[] = { + const struct ctl_table table_qux[] = { { .procname = "qux", .data = &data, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7ae7a4136855..4ebe6136b08d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -54,7 +54,6 @@ #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/perf_event.h> #include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> @@ -91,12 +90,6 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ - -#ifdef CONFIG_PERF_EVENTS -static const int six_hundred_forty_kb = 640 * 1024; -#endif - - static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -1609,7 +1602,7 @@ int proc_do_static_key(const struct ctl_table *table, int write, return ret; } -static struct ctl_table kern_table[] = { +static const struct ctl_table kern_table[] = { { .procname = "panic", .data = &panic_timeout, @@ -1831,16 +1824,6 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif - #if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) 
&& \ defined(CONFIG_DEBUG_STACKOVERFLOW) { @@ -1851,43 +1834,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_X86) - { - .procname = "panic_on_unrecovered_nmi", - .data = &panic_on_unrecovered_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "panic_on_io_nmi", - .data = &panic_on_io_nmi, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_type", - .data = &bootloader_type, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "bootloader_version", - .data = &bootloader_version, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, - { - .procname = "io_delay_type", - .data = &io_delay_type, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #if defined(CONFIG_MMU) { .procname = "randomize_va_space", @@ -1906,15 +1852,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) - { - .procname = "acpi_video_flags", - .data = &acpi_realmode_flags, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", @@ -1933,63 +1870,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_PERF_EVENTS - /* - * User-space scripts rely on the existence of this file - * as a feature check for perf_events being enabled. - * - * So it's an ABI, do not remove! - */ - { - .procname = "perf_event_paranoid", - .data = &sysctl_perf_event_paranoid, - .maxlen = sizeof(sysctl_perf_event_paranoid), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_mlock_kb", - .data = &sysctl_perf_event_mlock, - .maxlen = sizeof(sysctl_perf_event_mlock), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "perf_event_max_sample_rate", - .data = &sysctl_perf_event_sample_rate, - .maxlen = sizeof(sysctl_perf_event_sample_rate), - .mode = 0644, - .proc_handler = perf_event_max_sample_rate_handler, - .extra1 = SYSCTL_ONE, - }, - { - .procname = "perf_cpu_time_max_percent", - .data = &sysctl_perf_cpu_time_max_percent, - .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), - .mode = 0644, - .proc_handler = perf_cpu_time_max_percent_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_HUNDRED, - }, - { - .procname = "perf_event_max_stack", - .data = &sysctl_perf_event_max_stack, - .maxlen = sizeof(sysctl_perf_event_max_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&six_hundred_forty_kb, - }, - { - .procname = "perf_event_max_contexts_per_stack", - .data = &sysctl_perf_event_max_contexts_per_stack, - .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack), - .mode = 0644, - .proc_handler = perf_event_max_stack_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE_THOUSAND, - }, -#endif { .procname = "panic_on_warn", .data = &panic_on_warn, @@ -2021,7 +1901,7 @@ static struct ctl_table kern_table[] = { #endif }; -static struct ctl_table vm_table[] = { +static const struct ctl_table vm_table[] = { { .procname = "overcommit_memory", .data = &sysctl_overcommit_memory, diff --git a/kernel/task_work.c b/kernel/task_work.c index c969f1f26be5..d1efec571a4a 100644 --- a/kernel/task_work.c +++ 
b/kernel/task_work.c @@ -55,26 +55,14 @@ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; - int flags = notify & TWA_FLAGS; - notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { - /* - * Record the work call stack in order to print it in KASAN - * reports. - * - * Note that stack allocation can fail if TWAF_NO_ALLOC flag - * is set and new page is needed to expand the stack buffer. - */ - if (flags & TWAF_NO_ALLOC) - kasan_record_aux_stack_noalloc(work); - else - kasan_record_aux_stack(work); + kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index fe0ae82124fe..e6e9b85d4db5 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 + +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 62e73444ffe4..38dae590b29f 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -137,7 +137,8 @@ static int wdtest_func(void *arg) udelay(1); j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC)); + WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), + "Expected at least 1000ns, got %lu.\n", j2 - j1); /* Verify tsc-like stability with various numbers of errors injected. 
*/ max_retries = clocksource_get_max_watchdog_retry(); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7304d7cf47f2..e0eeacbe2521 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -373,16 +373,18 @@ void clocksource_verify_percpu(struct clocksource *cs) cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); cpus_read_lock(); - preempt_disable(); + migrate_disable(); clocksource_verify_choose_cpus(); if (cpumask_empty(&cpus_chosen)) { - preempt_enable(); + migrate_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } testcpu = smp_processor_id(); - pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", + cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + preempt_disable(); for_each_cpu(cpu, &cpus_chosen) { if (cpu == testcpu) continue; @@ -402,6 +404,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); + migrate_enable(); cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", @@ -1507,7 +1510,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strscpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str); mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 030426c8c944..22376a1a75b9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -58,6 +58,8 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) +static void retrigger_next_event(void *arg); + /* * The timer bases: * @@ -111,18 +113,17 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - } + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} /* * Functions and macros which are different for UP/SMP systems are kept in a @@ -145,11 +146,6 @@ static struct hrtimer_cpu_base migration_cpu_base = { #define migration_base migration_cpu_base.clock_base[0] -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return base == &migration_base; -} - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -183,27 +179,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * We do not migrate the timer when it is expiring before the next - * event on the target cpu. When high resolution is enabled, we cannot - * reprogram the target cpu hardware and we would cause it to fire - * late. To keep it simple, we handle the high resolution enabled and - * disabled case similar. 
+ * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will be issued to reprogram + * it as a last resort. + */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires < new_base->cpu_base->expires_next; + + return expires >= new_base->cpu_base->expires_next; } -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -254,8 +277,8 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -264,8 +287,7 @@ again: } WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -275,11 +297,6 @@ again: #else /* CONFIG_SMP */ -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return false; -} - static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) @@ -716,8 +733,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -static void retrigger_next_event(void *arg); - /* * Switch to high resolution mode */ @@ -1067,11 +1082,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * - * Returns 1 when the new timer is the leftmost timer in the tree. + * Returns true when the new timer is the leftmost timer in the tree.
*/ -static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - enum hrtimer_mode mode) +static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); @@ -1206,6 +1220,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; @@ -1217,10 +1232,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; /* + * Don't force local queuing if this enqueue happens on a unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + + /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. @@ -1249,8 +1270,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; + + /* + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. + */ + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; + } /* * Timer was forced to stay on the current CPU to avoid @@ -1371,6 +1411,18 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, } } +#ifdef CONFIG_SMP +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} +#else +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} +#endif + /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was @@ -1525,19 +1577,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; - - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; + switch (clock_id) { + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; } - WARN(1, "Invalid clockid %d. 
Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; -} - -static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) -{ - return HRTIMER_NORESTART; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0775b9ec952a..e3642278df43 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -165,26 +165,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off) * HVCLOCK * VVAR * - * The check for vdso_data->clock_mode is in the unlikely path of + * The check for vdso_clock->clock_mode is in the unlikely path of * the seq begin magic. So for the non-timens case most of the time * 'seq' is even, so the branch is not taken. * * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the * update to finish and for 'seq' to become even anyway. * - * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which * enforces the time namespace handling path. */ -static void timens_setup_vdso_data(struct vdso_data *vdata, - struct time_namespace *ns) +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) { - struct timens_offset *offset = vdata->offset; + struct timens_offset *offset = vc->offset; struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - vdata->seq = 1; - vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; offset[CLOCK_MONOTONIC] = monotonic; offset[CLOCK_MONOTONIC_RAW] = monotonic; offset[CLOCK_MONOTONIC_COARSE] = monotonic; @@ -219,7 +219,8 @@ static DEFINE_MUTEX(offset_lock); static void timens_set_vvar_page(struct task_struct *task, struct time_namespace *ns) { - struct vdso_data *vdata; + struct vdso_time_data *vdata; + struct vdso_clock *vc; unsigned int i; if (ns == &init_time_ns) @@ -235,10 +236,11 @@ static void timens_set_vvar_page(struct task_struct *task, goto out; ns->frozen_offsets = true; - vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_data(&vdata[i], ns); + timens_setup_vdso_clock_data(&vc[i], ns); out: mutex_unlock(&offset_lock); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 163e7a2033b6..b837d3d9d325 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -678,8 +678,7 @@ void ntp_notify_cmos_timer(bool offset_set) static void __init ntp_init_cmos_sync(void) { - hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - sync_hrtimer.function = sync_timer_callback; + hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); } #else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ static inline void __init ntp_init_cmos_sync(void) { } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 1af0bb2cc45c..7f4e4fb7381e 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp, return err; } -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct 
posix_clock_context *pccontext = fp->private_data; - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(pccontext, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - static int posix_clock_open(struct inode *inode, struct file *fp) { int err; @@ -171,11 +151,9 @@ static const struct file_operations posix_clock_file_operations = { .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, + .compat_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 881a9ce96af7..6222112533a7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -9,28 +9,23 @@ * * These are all the functions necessary to implement POSIX clocks & timers */ -#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/jhash.h> #include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/mutex.h> -#include <linux/sched/task.h> - -#include <linux/uaccess.h> #include <linux/list.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/hash.h> +#include <linux/memblock.h> +#include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> +#include <linux/prctl.h> +#include <linux/sched/task.h> +#include <linux/slab.h> #include <linux/syscalls.h> -#include <linux/wait.h> -#include <linux/workqueue.h> -#include <linux/export.h> -#include <linux/hashtable.h> -#include <linux/compat.h> -#include <linux/nospec.h> +#include <linux/time.h> #include <linux/time_namespace.h> +#include <linux/uaccess.h> #include "timekeeping.h" #include "posix-timers.h" @@ -46,39 +41,65 @@ static struct kmem_cache *posix_timers_cache; * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ -static DEFINE_HASHTABLE(posix_timers_hashtable, 9); -static DEFINE_SPINLOCK(hash_lock); +struct timer_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; + +static struct { + struct timer_hash_bucket *buckets; + unsigned long mask; +} __timer_data __ro_after_init __aligned(2*sizeof(long)); + +#define timer_buckets (__timer_data.buckets) +#define timer_hashmask (__timer_data.mask) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; +#define TIMER_ANY_ID INT_MIN + /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 
#endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id); -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ +#define lock_timer(tid) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \ + __timr; \ }) -static int hash(struct signal_struct *sig, unsigned int nr) +static inline void unlock_timer(struct k_itimer *timr) { - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); + if (likely((timr))) + spin_unlock_irq(&timr->it_lock); } -static struct k_itimer *__posix_timers_find(struct hlist_head *head, - struct signal_struct *sig, - timer_t id) +#define scoped_timer_get_or_fail(_id) \ + scoped_cond_guard(lock_timer, return -EINVAL, _id) + +#define scoped_timer (scope) + +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id); +DEFINE_CLASS_IS_COND_GUARD(lock_timer); + +static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) { + return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; +} + +static struct k_itimer *posix_timer_by_id(timer_t id) +{ + struct signal_struct *sig = current->signal; + struct timer_hash_bucket *bucket = hash_bucket(sig, id); struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; @@ -86,46 +107,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, return NULL; } -static struct k_itimer *posix_timer_by_id(timer_t id) +static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) { - struct signal_struct *sig = current->signal; - struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + unsigned long val = (unsigned long)timer->it_signal; - return __posix_timers_find(head, sig, id); + /* + * Mask out bit 0, which acts as invalid marker to prevent + * posix_timer_by_id() detecting it as valid. + */ + return (struct signal_struct *)(val & ~1UL); } -static int posix_timer_add(struct k_itimer *timer) +static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, + timer_t id) { - struct signal_struct *sig = current->signal; - struct hlist_head *head; - unsigned int cnt, id; + struct hlist_head *head = &bucket->head; + struct k_itimer *timer; - /* - * FIXME: Replace this by a per signal struct xarray once there is - * a plan to handle the resulting CRIU regression gracefully. - */ - for (cnt = 0; cnt <= INT_MAX; cnt++) { - spin_lock(&hash_lock); - id = sig->next_posix_timer_id; + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { + if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) + return true; + } + return false; +} - /* Write the next ID back. 
Clamp it to the positive space */ - sig->next_posix_timer_id = (id + 1) & INT_MAX; +static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) +{ + struct timer_hash_bucket *bucket = hash_bucket(sig, id); - head = &posix_timers_hashtable[hash(sig, id)]; - if (!__posix_timers_find(head, sig, id)) { - hlist_add_head_rcu(&timer->t_hash, head); - spin_unlock(&hash_lock); - return id; + scoped_guard (spinlock, &bucket->lock) { + /* + * Validate under the lock as this could have raced against + * another thread ending up with the same ID, which is + * highly unlikely, but possible. + */ + if (!posix_timer_hashed(bucket, sig, id)) { + /* + * Set the timer ID and the signal pointer to make + * it identifiable in the hash table. The signal + * pointer has bit 0 set to indicate that it is not + * yet fully initialized. posix_timer_hashed() + * masks this bit out, but the syscall lookup fails + * to match due to it being set. This guarantees + * that there can't be duplicate timer IDs handed + * out. + */ + timer->it_id = (timer_t)id; + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); + hlist_add_head_rcu(&timer->t_hash, &bucket->head); + return true; } - spin_unlock(&hash_lock); } - /* POSIX return code when no timer ID could be allocated */ - return -EAGAIN; + return false; } -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +static int posix_timer_add(struct k_itimer *timer, int req_id) { - spin_unlock_irqrestore(&timr->it_lock, flags); + struct signal_struct *sig = current->signal; + + if (unlikely(req_id != TIMER_ANY_ID)) { + if (!posix_timer_add_at(timer, sig, req_id)) + return -EBUSY; + + /* + * Move the ID counter past the requested ID, so that after + * switching back to normal mode the IDs are outside of the + * exact allocated region. That avoids ID collisions on the + * next regular timer_create() invocations. + */ + atomic_set(&sig->next_posix_timer_id, req_id + 1); + return req_id; + } + + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { + /* Get the next timer ID and clamp it to positive space */ + unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; + + if (posix_timer_add_at(timer, sig, id)) + return id; + cond_resched(); + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) @@ -222,9 +285,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) static __init int init_posix_timers(void) { - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof(struct k_itimer), 0, - SLAB_PANIC | SLAB_ACCOUNT, NULL); + posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), + __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); @@ -259,7 +321,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it * since the signal was queued. In either case, don't rearm and * drop the signal. 
*/ - if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) return false; if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) @@ -304,6 +366,9 @@ void posix_timer_queue_signal(struct k_itimer *timr) { lockdep_assert_held(&timr->it_lock); + if (!posixtimer_valid(timr)) + return; + timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; posixtimer_send_sigqueue(timr); } @@ -324,6 +389,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +long posixtimer_create_prctl(unsigned long ctrl) +{ + switch (ctrl) { + case PR_TIMER_CREATE_RESTORE_IDS_OFF: + current->signal->timer_create_restore_ids = 0; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_ON: + current->signal->timer_create_restore_ids = 1; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_GET: + return current->signal->timer_create_restore_ids; + } + return -EINVAL; +} + static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); @@ -350,8 +430,12 @@ static struct pid *good_sigevent(sigevent_t * event) static struct k_itimer *alloc_posix_timer(void) { - struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr; + if (unlikely(!posix_timers_cache)) + return NULL; + + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; @@ -373,15 +457,16 @@ void posixtimer_free_timer(struct k_itimer *tmr) static void posix_timer_unhash_and_free(struct k_itimer *tmr) { - spin_lock(&hash_lock); - hlist_del_rcu(&tmr->t_hash); - spin_unlock(&hash_lock); + struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); + + scoped_guard (spinlock, &bucket->lock) + hlist_del_rcu(&tmr->t_hash); posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } @@ -390,6 +475,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); + timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; int error, new_timer_id; @@ -404,26 +490,32 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, spin_lock_init(&new_timer->it_lock); + /* Special case for CRIU to restore timers with a given timer ID. */ + if (unlikely(current->signal->timer_create_restore_ids)) { + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) + return -EFAULT; + /* Valid IDs are 0..INT_MAX */ + if ((unsigned int)req_id > INT_MAX) + return -EINVAL; + } + /* * Add the timer to the hash table. The timer is not yet valid - * because new_timer::it_signal is still NULL. The timer id is also - * not yet visible to user space. + * after insertion, but has a unique ID allocated. 
*/ - new_timer_id = posix_timer_add(new_timer); + new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { posixtimer_free_timer(new_timer); return new_timer_id; } - new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(event)); - rcu_read_unlock(); + scoped_guard (rcu) + new_timer->it_pid = get_pid(good_sigevent(event)); if (!new_timer->it_pid) { error = -EINVAL; goto out; @@ -434,7 +526,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq.info.si_signo = SIGALRM; - memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } @@ -453,7 +544,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } /* * After succesful copy out, the timer ID is visible to user space - * now but not yet valid because new_timer::signal is still NULL. + * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create * callback. @@ -462,14 +553,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (error) goto out; - spin_lock_irq(¤t->sighand->siglock); - /* This makes the timer valid in the hash table */ - WRITE_ONCE(new_timer->it_signal, current->signal); - hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); - spin_unlock_irq(¤t->sighand->siglock); /* - * After unlocking sighand::siglock @new_timer is subject to - * concurrent removal and cannot be touched anymore + * timer::it_lock ensures that __lock_timer() observes a fully + * initialized timer when it observes a valid timer::it_signal. + * + * sighand::siglock is required to protect signal::posix_timers. + */ + scoped_guard (spinlock_irq, &new_timer->it_lock) { + guard(spinlock)(¤t->sighand->siglock); + /* + * new_timer::it_signal contains the signal pointer with + * bit 0 set, which makes it invalid for syscall operations. + * Store the unmodified signal pointer to make it valid. + */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head_rcu(&new_timer->list, ¤t->signal->posix_timers); + } + /* + * After unlocking @new_timer is subject to concurrent removal and + * cannot be touched anymore */ return 0; out: @@ -507,7 +609,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id) { struct k_itimer *timr; @@ -522,11 +624,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where - * timr::it_signal == NULL. timer::it_signal is only set after the - * rest of the initialization succeeded. + * timr::it_signal is marked invalid. timer::it_signal is only set + * after the rest of the initialization succeeded. * * Timer destruction happens in steps: - * 1) Set timr::it_signal to NULL with timr::it_lock held + * 1) Set timr::it_signal marked invalid with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Put the reference count. 
@@ -538,30 +640,26 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * When the reference count reaches zero, the timer is scheduled * for RCU removal after the grace period. * - * Holding rcu_read_lock() accross the lookup ensures that + * Holding rcu_read_lock() across the lookup ensures that * the timer cannot be freed. * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id - * can't change, but timr::it_signal becomes NULL during - * destruction. + * can't change, but timr::it_signal can become invalid during + * destruction, which makes the locked check fail. */ - rcu_read_lock(); + guard(rcu)(); timr = posix_timer_by_id(timer_id); if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); + spin_lock_irq(&timr->it_lock); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. */ - if (timr->it_signal == current->signal) { - rcu_read_unlock(); + if (timr->it_signal == current->signal) return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); + spin_unlock_irq(&timr->it_lock); } - rcu_read_unlock(); - return NULL; } @@ -652,24 +750,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - memset(setting, 0, sizeof(*setting)); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, setting); - - unlock_timer(timr, flags); - return ret; + scoped_timer_get_or_fail(timer_id) + scoped_timer->kclock->timer_get(scoped_timer, setting); + return 0; } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -723,18 +807,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { - struct k_itimer *timr; - unsigned long flags; - int overrun; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timer_overrun_to_int(timr); - unlock_timer(timr, flags); - - return overrun; + scoped_timer_get_or_fail(timer_id) + return timer_overrun_to_int(scoped_timer); } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, @@ -747,7 +821,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the - * hood. See hrtimer_init(). Update timr->kclock, so the generic + * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might @@ -756,8 +830,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? 
&clock_realtime : &clock_monotonic; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; + hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) expires = ktime_add_safe(expires, timer->base->get_time()); @@ -791,26 +864,13 @@ static void common_timer_wait_running(struct k_itimer *timer) * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. */ -static struct k_itimer *timer_wait_running(struct k_itimer *timer, - unsigned long *flags) +static void timer_wait_running(struct k_itimer *timer) { - const struct k_clock *kc = READ_ONCE(timer->kclock); - timer_t timer_id = READ_ONCE(timer->it_id); - - /* Prevent kfree(timer) after dropping the lock */ - rcu_read_lock(); - unlock_timer(timer, *flags); - /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ - if (!WARN_ON_ONCE(!kc->timer_wait_running)) - kc->timer_wait_running(timer); - - rcu_read_unlock(); - /* Relock the timer. It might be not longer hashed. */ - return lock_timer(timer_id, flags); + timer->kclock->timer_wait_running(timer); } /* @@ -865,15 +925,9 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int tmr_flags, - struct itimerspec64 *new_spec64, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int error; - if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; @@ -881,33 +935,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); - timr = lock_timer(timer_id, &flags); -retry: - if (!timr) - return -EINVAL; + for (; ; old_spec64 = NULL) { + struct k_itimer *timr; - if (old_spec64) - old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + scoped_timer_get_or_fail(timer_id) { + timr = scoped_timer; - /* Prevent signal delivery and rearming. */ - timr->it_signal_seq++; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - - if (error == TIMER_RETRY) { - // We already got the old time... - old_spec64 = NULL; - /* Unlocks and relocks the timer if it still exists */ - timr = timer_wait_running(timr, &flags); - goto retry; - } - unlock_timer(timr, flags); + /* Prevent signal delivery and rearming. */ + timr->it_signal_seq++; - return error; + int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); + if (ret != TIMER_RETRY) + return ret; + + /* Protect the timer from being freed when leaving the lock scope */ + rcu_read_lock(); + } + timer_wait_running(timr); + rcu_read_unlock(); + } } /* Set a POSIX.1b interval timer */ @@ -978,110 +1027,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) } } -static inline int timer_delete_hook(struct k_itimer *timer) +static void posix_timer_delete(struct k_itimer *timer) { - const struct k_clock *kc = timer->kclock; - - /* Prevent signal delivery and rearming. */ + /* + * Invalidate the timer, remove it from the linked list and remove + * it from the ignored list if pending. 
+ * + * The invalidation must be written with siglock held so that the + * signal code observes the invalidated timer::it_signal in + * do_sigaction(), which prevents it from moving a pending signal + * of a deleted timer to the ignore list. + * + * The invalidation also prevents signal queueing, signal delivery + * and therefore rearming from the signal delivery path. + * + * A concurrent lookup can still find the timer in the hash, but it + * will check timer::it_signal with timer::it_lock held and observe + * bit 0 set, which invalidates it. That also prevents the timer ID + * from being handed out before this timer is completely gone. + */ timer->it_signal_seq++; - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); + scoped_guard (spinlock, ¤t->sighand->siglock) { + unsigned long sig = (unsigned long)timer->it_signal | 1UL; + + WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); + hlist_del_rcu(&timer->list); + posix_timer_cleanup_ignored(timer); + } + + while (timer->kclock->timer_del(timer) == TIMER_RETRY) { + guard(rcu)(); + spin_unlock_irq(&timer->it_lock); + timer_wait_running(timer); + spin_lock_irq(&timer->it_lock); + } } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; - unsigned long flags; - timer = lock_timer(timer_id, &flags); - -retry_delete: - if (!timer) - return -EINVAL; - - if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { - /* Unlocks and relocks the timer if it still exists */ - timer = timer_wait_running(timer, &flags); - goto retry_delete; + scoped_timer_get_or_fail(timer_id) { + timer = scoped_timer; + posix_timer_delete(timer); } - - spin_lock(¤t->sighand->siglock); - hlist_del(&timer->list); - posix_timer_cleanup_ignored(timer); - /* - * A concurrent lookup could check timer::it_signal lockless. It - * will reevaluate with timer::it_lock held and observe the NULL. - * - * It must be written with siglock held so that the signal code - * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), - * which prevents it from moving a pending signal of a deleted - * timer to the ignore list. - */ - WRITE_ONCE(timer->it_signal, NULL); - spin_unlock(¤t->sighand->siglock); - - unlock_timer(timer, flags); + /* Remove it from the hash, which frees up the timer ID */ posix_timer_unhash_and_free(timer); return 0; } /* - * Delete a timer if it is armed, remove it from the hash and schedule it - * for RCU freeing. - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - - /* - * irqsave is required to make timer_wait_running() work. - */ - spin_lock_irqsave(&timer->it_lock, flags); - -retry_delete: - /* - * Even if the timer is not longer accessible from other tasks - * it still might be armed and queued in the underlying timer - * mechanism. Worse, that timer mechanism might run the expiry - * function concurrently. - */ - if (timer_delete_hook(timer) == TIMER_RETRY) { - /* - * Timer is expired concurrently, prevent livelocks - * and pointless spinning on RT. - * - * timer_wait_running() drops timer::it_lock, which opens - * the possibility for another task to delete the timer. - * - * That's not possible here because this is invoked from - * do_exit() only for the last thread of the thread group. - * So no other task can access and delete that timer. 
- */ - if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) - return; - - goto retry_delete; - } - hlist_del(&timer->list); - - posix_timer_cleanup_ignored(timer); - - /* - * Setting timer::it_signal to NULL is technically not required - * here as nothing can access the timer anymore legitimately via - * the hash table. Set it to NULL nevertheless so that all deletion - * paths are consistent. - */ - WRITE_ONCE(timer->it_signal, NULL); - - spin_unlock_irqrestore(&timer->it_lock, flags); - posix_timer_unhash_and_free(timer); -} - -/* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. @@ -1089,18 +1086,26 @@ retry_delete: void exit_itimers(struct task_struct *tsk) { struct hlist_head timers; + struct hlist_node *next; + struct k_itimer *timer; + + /* Clear restore mode for exec() */ + tsk->signal->timer_create_restore_ids = 0; if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ - spin_lock_irq(&tsk->sighand->siglock); - hlist_move_list(&tsk->signal->posix_timers, &timers); - spin_unlock_irq(&tsk->sighand->siglock); + scoped_guard (spinlock_irq, &tsk->sighand->siglock) + hlist_move_list(&tsk->signal->posix_timers, &timers); /* The timers are not longer accessible via tsk::signal */ - while (!hlist_empty(&timers)) - itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + hlist_for_each_entry_safe(timer, next, &timers, list) { + scoped_guard (spinlock_irq, &timer->it_lock) + posix_timer_delete(timer); + posix_timer_unhash_and_free(timer); + cond_resched(); + } /* * There should be no timers on the ignored list. itimer_delete() has @@ -1545,3 +1550,26 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id) return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } + +static int __init posixtimer_init(void) +{ + unsigned long i, size; + unsigned int shift; + + if (IS_ENABLED(CONFIG_BASE_SMALL)) + size = 512; + else + size = roundup_pow_of_two(512 * num_possible_cpus()); + + timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), + size, 0, 0, &shift, NULL, size, size); + size = 1UL << shift; + timer_hashmask = size - 1; + + for (i = 0; i < size; i++) { + spin_lock_init(&timer_buckets[i].lock); + INIT_HLIST_HEAD(&timer_buckets[i].head); + } + return 0; +} +core_initcall(posixtimer_init); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fcca4e72f1ef..cc15fe293719 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. 
*/ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - sched_clock_timer.function = sched_clock_poll; + hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index e28f9210f8a1..a88b72b0f35e 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -100,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - bctimer.function = bc_handler; + hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ed58eebb4e8f..0207868c8b4d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1020,6 +1020,8 @@ static inline ktime_t tick_get_next_period(void) /** * tick_broadcast_setup_oneshot - setup the broadcast device + * @bc: the broadcast device + * @from_periodic: true if called from periodic mode */ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa058510af9c..c527b421c865 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1573,12 +1573,10 @@ void tick_setup_sched_timer(bool hrtimer) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); - ts->sched_timer.function = tick_nohz_handler; - } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d128825d343..929846b8b45a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -485,91 +485,30 @@ u64 notrace ktime_get_tai_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); -static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. + */ +u64 ktime_get_real_fast_ns(void) { + struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; - u64 basem, baser, delta; + u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); - if (mono) - *mono = basem + delta; return baser + delta; } - -/** - * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. - * - * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. 
- */ -u64 ktime_get_real_fast_ns(void) -{ - return __ktime_get_real_fast(&tk_fast_mono, NULL); -} EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** - * ktime_get_fast_timestamps: - NMI safe timestamps - * @snapshot: Pointer to timestamp storage - * - * Stores clock monotonic, boottime and realtime timestamps. - * - * Boot time is a racy access on 32bit systems if the sleep time injection - * happens late during resume and not in timekeeping_resume(). That could - * be avoided by expanding struct tk_read_base with boot offset for 32bit - * and adding more overhead to the update. As this is a hard to observe - * once per resume event which can be filtered with reasonable effort using - * the accurate mono/real timestamps, it's probably not worth the trouble. - * - * Aside of that it might be possible on 32 and 64 bit to observe the - * following when the sleep time injection happens late: - * - * CPU 0 CPU 1 - * timekeeping_resume() - * ktime_get_fast_timestamps() - * mono, real = __ktime_get_real_fast() - * inject_sleep_time() - * update boot offset - * boot = mono + bootoffset; - * - * That means that boot time already has the sleep time adjustment, but - * real time does not. On the next readout both are in sync again. - * - * Preventing this for 64bit is not really feasible without destroying the - * careful cache layout of the timekeeper because the sequence count and - * struct tk_read_base would then need two cache lines instead of one. - * - * Access to the time keeper clock source is disabled across the innermost - * steps of suspend/resume. The accessors still work, but the timestamps - * are frozen until time keeping is resumed which happens very early. - * - * For regular suspend/resume there is no observable difference vs. sched - * clock, but it might affect some of the nasty low level debug printks. - * - * OTOH, access to sched clock is not guaranteed across suspend/resume on - * all systems either so it depends on the hardware in use. - * - * If that turns out to be a real problem then this could be mitigated by - * using sched clock in a similar way as during early boot. But it's not as - * trivial as on early boot because it needs some careful protection - * against the clock monotonic timestamp jumping backwards on resume. - */ -void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); - snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); -} - -/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * @@ -743,20 +682,19 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act } /** - * timekeeping_forward_now - update clock to the current time + * timekeeping_forward - update clock to given cycle now value * @tk: Pointer to the timekeeper to update + * @cycle_now: Current clocksource read value * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, * as it avoids having to deal with this time offset explicitly. 
*/ -static void timekeeping_forward_now(struct timekeeper *tk) +static void timekeeping_forward(struct timekeeper *tk, u64 cycle_now) { - u64 cycle_now, delta; + u64 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); - cycle_now = tk_clock_read(&tk->tkr_mono); - delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, - tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -772,6 +710,21 @@ static void timekeeping_forward_now(struct timekeeper *tk) } /** + * timekeeping_forward_now - update clock to the current time + * @tk: Pointer to the timekeeper to update + * + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. + */ +static void timekeeping_forward_now(struct timekeeper *tk) +{ + u64 cycle_now = tk_clock_read(&tk->tkr_mono); + + timekeeping_forward(tk, cycle_now); +} + +/** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set * @@ -2212,6 +2165,54 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, return offset; } +static u64 timekeeping_accumulate(struct timekeeper *tk, u64 offset, + enum timekeeping_adv_mode mode, + unsigned int *clock_set) +{ + int shift = 0, maxshift; + + /* + * TK_ADV_FREQ indicates that adjtimex(2) directly set the + * frequency or the tick length. + * + * Accumulate the offset, so that the new multiplier starts from + * now. This is required as otherwise for offsets, which are + * smaller than tk::cycle_interval, timekeeping_adjust() could set + * xtime_nsec backwards, which subsequently causes time going + * backwards in the coarse time getters. But even for the case + * where offset is greater than tk::cycle_interval the periodic + * accumulation does not have much value. + * + * Also reset tk::ntp_error as it does not make sense to keep the + * old accumulated error around in this case. + */ + if (mode == TK_ADV_FREQ) { + timekeeping_forward(tk, tk->tkr_mono.cycle_last + offset); + tk->ntp_error = 0; + return 0; + } + + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller than the offset. We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. 
+ */ + shift = ilog2(offset) - ilog2(tk->cycle_interval); + shift = max(0, shift); + /* Bound shift to one less than what overflows tick_length */ + maxshift = (64 - (ilog2(ntp_tick_length()) + 1)) - 1; + shift = min(shift, maxshift); + while (offset >= tk->cycle_interval) { + offset = logarithmic_accumulation(tk, offset, shift, clock_set); + if (offset < tk->cycle_interval << shift) + shift--; + } + return offset; +} + /* * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length @@ -2221,7 +2222,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; unsigned int clock_set = 0; - int shift = 0, maxshift; u64 offset; guard(raw_spinlock_irqsave)(&tk_core.lock); @@ -2238,24 +2238,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; - /* - * With NO_HZ we may have to accumulate many cycle_intervals - * (think "ticks") worth of time at once. To do this efficiently, - * we calculate the largest doubling multiple of cycle_intervals - * that is smaller than the offset. We then accumulate that - * chunk in one go, and then try to consume the next smaller - * doubled multiple. - */ - shift = ilog2(offset) - ilog2(tk->cycle_interval); - shift = max(0, shift); - /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; - shift = min(shift, maxshift); - while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift, &clock_set); - if (offset < tk->cycle_interval<<shift) - shift--; - } + offset = timekeeping_accumulate(tk, offset, mode, &clock_set); /* Adjust the multiplier to correct NTP error */ timekeeping_adjust(tk, offset); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5860bf6d16f..c8f776dc6ee0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -301,7 +301,7 @@ static int timer_migration_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table timer_sysctl[] = { +static const struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, @@ -956,33 +956,29 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = per_cpu_ptr(&timer_bases[index], cpu); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); - return base; + index = BASE_DEF; + + return per_cpu_ptr(&timer_bases[index], cpu); } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = this_cpu_ptr(&timer_bases[index]); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. 
*/ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = this_cpu_ptr(&timer_bases[BASE_DEF]); - return base; + index = BASE_DEF; + + return this_cpu_ptr(&timer_bases[index]); } static inline struct timer_base *get_timer_base(u32 tflags) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1c311c46da50..cfbb46cc4e76 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,7 +98,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 066c9ddca4ec..2f6330831f08 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1670,13 +1670,14 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) * be different from tmigr_hierarchy_levels, contains only a * single group. */ - if (group->parent || i == tmigr_hierarchy_levels || - (list_empty(&tmigr_level_list[i]) && - list_is_singular(&tmigr_level_list[i - 1]))) + if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) break; } while (i < tmigr_hierarchy_levels); + /* Assert single root */ + WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + while (i > 0) { group = stack[--i]; @@ -1718,7 +1719,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) WARN_ON_ONCE(top == 0); lvllist = &tmigr_level_list[top]; - if (group->num_children == 1 && list_is_singular(lvllist)) { + + /* + * Newly created root level should have accounted the upcoming + * CPU's child group and pre-accounted the old root. + */ + if (group->num_children == 2 && list_is_singular(lvllist)) { /* * The target CPU must never do the prepare work, except * on early boot when the boot CPU is the target. Otherwise diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 154accc7a543..ae19f70f8170 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -110,22 +110,19 @@ struct tmigr_cpu { * union tmigr_state - state of tmigr_group * @state: Combined version of the state - only used for atomic * read/cmpxchg function - * @struct: Split version of the state - only use the struct members to + * &anon struct: Split version of the state - only use the struct members to * update information to stay independent of endianness + * @active: Contains each mask bit of the active children + * @migrator: Contains mask of the child which is migrator + * @seq: Sequence counter needs to be increased when an update + * to the tmigr_state is done. It prevents a race when + * updates in the child groups are propagated in changed + * order. Detailed information about the scenario is + * given in the documentation at the begin of + * timer_migration.c. 
*/ union tmigr_state { u32 state; - /** - * struct - split state of tmigr_group - * @active: Contains each mask bit of the active children - * @migrator: Contains mask of the child which is migrator - * @seq: Sequence counter needs to be increased when an update - * to the tmigr_state is done. It prevents a race when - * updates in the child groups are propagated in changed - * order. Detailed information about the scenario is - * given in the documentation at the begin of - * timer_migration.c. - */ struct { u8 active; u8 migrator; diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 05d383143165..01c2ab1e8971 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,29 +15,29 @@ #include "timekeeping_internal.h" -static inline void update_vdso_data(struct vdso_data *vdata, - struct timekeeper *tk) +static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; + vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; #endif - vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; + vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; #endif - vdata[CS_RAW].mask = tk->tkr_raw.mask; - vdata[CS_RAW].mult = tk->tkr_raw.mult; - vdata[CS_RAW].shift = tk->tkr_raw.shift; + vc[CS_RAW].mask = tk->tkr_raw.mask; + vc[CS_RAW].mult = tk->tkr_raw.mult; + vc[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; @@ -55,7 +55,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { @@ -65,19 +65,20 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; @@ -86,21 +87,21 @@ void 
update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; - vdata[CS_HRES_COARSE].clock_mode = clock_mode; - vdata[CS_RAW].clock_mode = clock_mode; + vc[CS_HRES_COARSE].clock_mode = clock_mode; + vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; /* CLOCK_MONOTONIC_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; @@ -108,32 +109,31 @@ void update_vsyscall(struct timekeeper *tk) /* * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) - update_vdso_data(vdata, tk); + update_vdso_time_data(vdata, tk); __arch_update_vsyscall(vdata); vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + vdata->tz_minuteswest = sys_tz.tz_minuteswest; + vdata->tz_dsttime = sys_tz.tz_dsttime; - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } /** @@ -150,7 +150,7 @@ void update_vsyscall_tz(void) */ unsigned long vdso_update_begin(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); @@ -167,9 +167,9 @@ unsigned long vdso_update_begin(void) */ void vdso_update_end(unsigned long flags) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); } diff --git a/kernel/torture.c b/kernel/torture.c index dede150aef01..3a0a8cc60401 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -792,6 +792,8 @@ static void torture_stutter_cleanup(void) stutter_task = NULL; } +static unsigned long torture_init_jiffies; + static void torture_print_module_parms(void) { @@ -821,6 +823,7 @@ bool torture_init_begin(char *ttype, int v) torture_type = ttype; verbose = v; fullstop = FULLSTOP_DONTSTOP; + WRITE_ONCE(torture_init_jiffies, jiffies); // Lockless reads. torture_print_module_parms(); return true; } @@ -837,6 +840,15 @@ void torture_init_end(void) EXPORT_SYMBOL_GPL(torture_init_end); /* + * Get the torture_init_begin()-time value of the jiffies counter. 
+ */ +unsigned long get_torture_init_jiffies(void) +{ + return READ_ONCE(torture_init_jiffies); +} +EXPORT_SYMBOL_GPL(get_torture_init_jiffies); + +/* * Clean up torture module. Please note that this is -not- invoked via * the usual module_exit() mechanism, but rather by an explicit call from * the client torture module. Returns true if a race with system shutdown diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 74c2b1d43bb9..d570b8b9c0a9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,9 +31,14 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.rst -config HAVE_FUNCTION_GRAPH_RETVAL +config HAVE_FUNCTION_GRAPH_FREGS bool +config HAVE_FTRACE_GRAPH_FUNC + bool + help + True if ftrace_graph_func() is defined. + config HAVE_DYNAMIC_FTRACE bool help @@ -57,6 +62,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS This allows for use of ftrace_regs_get_argument() and ftrace_regs_get_stack_pointer(). +config HAVE_FTRACE_REGS_HAVING_PT_REGS + bool + help + If this is set, ftrace_regs has pt_regs, thus it can convert to + pt_regs without allocating memory. + config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE bool help @@ -232,7 +243,7 @@ config FUNCTION_GRAPH_TRACER config FUNCTION_GRAPH_RETVAL bool "Kernel Function Graph Return Value" - depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on HAVE_FUNCTION_GRAPH_FREGS depends on FUNCTION_GRAPH_TRACER default n help @@ -296,10 +307,9 @@ config DYNAMIC_FTRACE_WITH_ARGS config FPROBE bool "Kernel Function Probe (fprobe)" - depends on FUNCTION_TRACER - depends on DYNAMIC_FTRACE_WITH_REGS - depends on HAVE_RETHOOK - select RETHOOK + depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC + depends on DYNAMIC_FTRACE_WITH_ARGS + select FUNCTION_GRAPH_TRACER default n help This option enables kernel function probe (fprobe) based on ftrace. 
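The FPROBE Kconfig rework above moves fprobe off rethook/pt_regs and onto the function-graph infrastructure (HAVE_FUNCTION_GRAPH_FREGS + HAVE_FTRACE_GRAPH_FUNC), so fprobe callbacks now take a struct ftrace_regs pointer, matching the kprobe_multi handlers converted in the bpf_trace.c hunk below. A minimal sketch of a caller under the new prototypes, assuming the existing register_fprobe()/unregister_fprobe() API; the module boilerplate and the probed symbol "kernel_clone" are illustrative assumptions and not part of this series, only the callback signatures and the ftrace_regs accessors are taken from the patches:

#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/fprobe.h>

/* Entry callback: same prototype as kprobe_multi_link_handler() below. */
static int sample_entry(struct fprobe *fp, unsigned long fentry_ip,
			unsigned long ret_ip, struct ftrace_regs *fregs,
			void *data)
{
	/* DYNAMIC_FTRACE_WITH_ARGS provides argument access via ftrace_regs. */
	pr_info("entry: ip=0x%lx arg0=0x%lx\n", fentry_ip,
		ftrace_regs_get_argument(fregs, 0));
	return 0;
}

/* Exit callback: mirrors kprobe_multi_link_exit_handler() below. */
static void sample_exit(struct fprobe *fp, unsigned long fentry_ip,
			unsigned long ret_ip, struct ftrace_regs *fregs,
			void *data)
{
	pr_info("exit: ret=0x%lx\n", ftrace_regs_get_return_value(fregs));
}

static struct fprobe sample_fprobe = {
	.entry_handler	= sample_entry,
	.exit_handler	= sample_exit,
};

static int __init sample_init(void)
{
	/* The probed symbol is an arbitrary example. */
	return register_fprobe(&sample_fprobe, "kernel_clone", NULL);
}

static void __exit sample_cleanup(void)
{
	unregister_fprobe(&sample_fprobe);
}

module_init(sample_init);
module_exit(sample_cleanup);
MODULE_LICENSE("GPL");

The exit handler only fires when .exit_handler is set, which is why the fprobe core needs the function-graph return hook selected by the new FPROBE dependencies above.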
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1b8db5aee9d3..997fb2a47c92 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -357,17 +357,6 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *bpf_get_probe_write_proto(void) -{ - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", - current->comm, task_pid_nr(current)); - - return &bpf_probe_write_user_proto; -} - #define MAX_TRACE_PRINTK_VARARGS 3 #define BPF_TRACE_PRINTK_SIZE 1024 @@ -619,7 +608,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_sample_data *sd) + u64 flags, struct perf_raw_record *raw, + struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); @@ -644,6 +634,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; + perf_sample_save_raw_data(sd, event, raw); + return perf_event_output(event, sd, regs); } @@ -687,9 +679,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, } perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - err = __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_trace_nest_level); preempt_enable(); @@ -748,9 +739,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - ret = __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_event_output_nest_level); preempt_enable(); @@ -853,7 +843,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc if (unlikely(is_global_init(task))) return -EPERM; - if (irqs_disabled()) { + if (!preemptible()) { /* Do an early check on signal validity. Otherwise, * the error is lost in deferred irq_work. */ @@ -1048,27 +1038,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_X86_KERNEL_IBT -static unsigned long get_entry_ip(unsigned long fentry_ip) +static inline unsigned long get_entry_ip(unsigned long fentry_ip) { - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. 
- */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) +#ifdef CONFIG_X86_KERNEL_IBT + if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; +#endif return fentry_ip; } -#else -#define get_entry_ip(fentry_ip) fentry_ip -#endif BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { @@ -1444,6 +1421,8 @@ late_initcall(bpf_key_sig_kfuncs_init); static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + switch (func_id) { case BPF_FUNC_map_lookup_elem: return &bpf_map_lookup_elem_proto; @@ -1485,9 +1464,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_read_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; - case BPF_FUNC_probe_write_user: - return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? - NULL : bpf_get_probe_write_proto(); case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: @@ -1566,7 +1542,22 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); default: - return bpf_base_func_proto(func_id, prog); + break; + } + + func_proto = bpf_base_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_probe_write_user: + return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? + NULL : &bpf_probe_write_user_proto; + default: + return NULL; } } @@ -2242,6 +2233,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) { struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; + struct bpf_prog *prog = NULL; int ret; mutex_lock(&bpf_event_mutex); @@ -2262,18 +2254,22 @@ void perf_event_detach_bpf_prog(struct perf_event *event) } put: - /* - * It could be that the bpf_prog is not sleepable (and will be freed - * via normal RCU), but is called from a point that supports sleepable - * programs and uses tasks-trace-RCU. - */ - synchronize_rcu_tasks_trace(); - - bpf_prog_put(event->prog); + prog = event->prog; event->prog = NULL; unlock: mutex_unlock(&bpf_event_mutex); + + if (prog) { + /* + * It could be that the bpf_prog is not sleepable (and will be freed + * via normal RCU), but is called from a point that supports sleepable + * programs and uses tasks-trace-RCU. + */ + synchronize_rcu_tasks_trace(); + + bpf_prog_put(prog); + } } int perf_event_query_prog_array(struct perf_event *event, void __user *info) @@ -2584,6 +2580,20 @@ struct user_syms { char *buf; }; +#ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS +static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs); +#define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs) +#else +#define bpf_kprobe_multi_pt_regs_ptr() (NULL) +#endif + +static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip) +{ + unsigned long ip = ftrace_get_symaddr(fentry_ip); + + return ip ? 
: fentry_ip; +} + static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) { unsigned long __user usymbol; @@ -2778,7 +2788,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, - unsigned long entry_ip, struct pt_regs *regs, + unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { @@ -2790,16 +2800,18 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, .entry_ip = entry_ip, }; struct bpf_run_ctx *old_run_ctx; + struct pt_regs *regs; int err; if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { bpf_prog_inc_misses_counter(link->link.prog); - err = 0; + err = 1; goto out; } migrate_disable(); rcu_read_lock(); + regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); @@ -2813,26 +2825,28 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data); + err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), + fregs, false, data); return is_kprobe_session(link->link.prog) ? err : 0; } static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data); + kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), + fregs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 30e3ddc8a8a8..5dddfc2149f6 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -292,13 +292,15 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset) } /* ftrace_graph_entry set to this to tell some archs to run function graph */ -static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops) +static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops, + struct ftrace_regs *fregs) { return 0; } /* ftrace_graph_return set to this to tell some archs to run function graph */ -static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops) +static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops, + struct ftrace_regs *fregs) { } @@ -520,13 +522,15 @@ int __weak ftrace_disable_ftrace_graph_caller(void) #endif int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { return 0; } static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { } @@ -644,14 +648,20 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, #endif /* If the caller does not use ftrace, call this function. 
*/ -int function_graph_enter(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp) +int function_graph_enter_regs(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp, + struct ftrace_regs *fregs) { struct ftrace_graph_ent trace; unsigned long bitmap = 0; int offset; + int bit; int i; + bit = ftrace_test_recursion_trylock(func, ret); + if (bit < 0) + return -EBUSY; + trace.func = func; trace.depth = ++current->curr_ret_depth; @@ -663,7 +673,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, if (static_branch_likely(&fgraph_do_direct)) { int save_curr_ret_stack = current->curr_ret_stack; - if (static_call(fgraph_func)(&trace, fgraph_direct_gops)) + if (static_call(fgraph_func)(&trace, fgraph_direct_gops, fregs)) bitmap |= BIT(fgraph_direct_gops->idx); else /* Clear out any saved storage */ @@ -681,7 +691,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, save_curr_ret_stack = current->curr_ret_stack; if (ftrace_ops_test(&gops->ops, func, NULL) && - gops->entryfunc(&trace, gops)) + gops->entryfunc(&trace, gops, fregs)) bitmap |= BIT(i); else /* Clear out any saved storage */ @@ -697,12 +707,13 @@ int function_graph_enter(unsigned long ret, unsigned long func, * flag, set that bit always. */ set_bitmap(current, offset, bitmap | BIT(0)); - + ftrace_test_recursion_unlock(bit); return 0; out_ret: current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1; out: current->curr_ret_depth--; + ftrace_test_recursion_unlock(bit); return -EBUSY; } @@ -792,15 +803,12 @@ static struct notifier_block ftrace_suspend_notifier = { .notifier_call = ftrace_suspend_notifier_call, }; -/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ -struct fgraph_ret_regs; - /* * Send the trace to the ring-buffer. * @return the original return address. */ -static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, - unsigned long frame_pointer) +static inline unsigned long +__ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointer) { struct ftrace_ret_stack *ret_stack; struct ftrace_graph_ret trace; @@ -818,9 +826,11 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs return (unsigned long)panic; } - trace.rettime = trace_clock_local(); + if (fregs) + ftrace_regs_set_instruction_pointer(fregs, ret); + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL - trace.retval = fgraph_ret_regs_return_value(ret_regs); + trace.retval = ftrace_regs_get_return_value(fregs); #endif bitmap = get_bitmap_bits(current, offset); @@ -828,7 +838,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs #ifdef CONFIG_HAVE_STATIC_CALL if (static_branch_likely(&fgraph_do_direct)) { if (test_bit(fgraph_direct_gops->idx, &bitmap)) - static_call(fgraph_retfunc)(&trace, fgraph_direct_gops); + static_call(fgraph_retfunc)(&trace, fgraph_direct_gops, fregs); } else #endif { @@ -838,7 +848,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs if (gops == &fgraph_stub) continue; - gops->retfunc(&trace, gops); + gops->retfunc(&trace, gops, fregs); } } @@ -855,14 +865,14 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs } /* - * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can - * leave only ftrace_return_to_handler(ret_regs). + * After all architecures have selected HAVE_FUNCTION_GRAPH_FREGS, we can + * leave only ftrace_return_to_handler(fregs). 
*/ -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL -unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS +unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs) { - return __ftrace_return_to_handler(ret_regs, - fgraph_ret_regs_frame_pointer(ret_regs)); + return __ftrace_return_to_handler(fregs, + ftrace_regs_get_frame_pointer(fregs)); } #else unsigned long ftrace_return_to_handler(unsigned long frame_pointer) @@ -1010,7 +1020,8 @@ void ftrace_graph_sleep_time_control(bool enable) * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h */ -void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops); +void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); /* The callbacks that hook a function */ trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; @@ -1174,7 +1185,8 @@ void ftrace_graph_exit_task(struct task_struct *t) #ifdef CONFIG_DYNAMIC_FTRACE static int fgraph_pid_func(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = gops->ops.private; int pid; @@ -1188,7 +1200,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace, return 0; } - return gops->saved_func(trace, gops); + return gops->saved_func(trace, gops, fregs); } void fgraph_update_pid_func(void) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 9ff018245840..33082c4e8154 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -8,98 +8,224 @@ #include <linux/fprobe.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> -#include <linux/rethook.h> +#include <linux/list.h> +#include <linux/mutex.h> #include <linux/slab.h> #include <linux/sort.h> +#include <asm/fprobe.h> + #include "trace.h" -struct fprobe_rethook_node { - struct rethook_node node; - unsigned long entry_ip; - unsigned long entry_parent_ip; - char data[]; -}; +#define FPROBE_IP_HASH_BITS 8 +#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS) -static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) -{ - struct fprobe_rethook_node *fpr; - struct rethook_node *rh = NULL; - struct fprobe *fp; - void *entry_data = NULL; - int ret = 0; +#define FPROBE_HASH_BITS 6 +#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS) - fp = container_of(ops, struct fprobe, ops); +#define SIZE_IN_LONG(x) ((x + sizeof(long) - 1) >> (sizeof(long) == 8 ? 3 : 2)) - if (fp->exit_handler) { - rh = rethook_try_get(fp->rethook); - if (!rh) { - fp->nmissed++; - return; - } - fpr = container_of(rh, struct fprobe_rethook_node, node); - fpr->entry_ip = ip; - fpr->entry_parent_ip = parent_ip; - if (fp->entry_data_size) - entry_data = fpr->data; +/* + * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still + * exists. The key is the address of fprobe instance. + * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe + * instance related to the funciton address. The key is the ftrace IP + * address. + * + * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp + * are set NULL and delete those from both hash tables (by hlist_del_rcu). + * After an RCU grace period, the fprobe_hlist itself will be released. 
+ * + * fprobe_table and fprobe_ip_table can be accessed from either + * - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held. + * - RCU hlist traversal under disabling preempt + */ +static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; +static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE]; +static DEFINE_MUTEX(fprobe_mutex); + +/* + * Find first fprobe in the hlist. It will be iterated twice in the entry + * probe, once for correcting the total required size, the second time is + * calling back the user handlers. + * Thus the hlist in the fprobe_table must be sorted and new probe needs to + * be added *before* the first fprobe. + */ +static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip) +{ + struct fprobe_hlist_node *node; + struct hlist_head *head; + + head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { + if (node->addr == ip) + return node; } + return NULL; +} +NOKPROBE_SYMBOL(find_first_fprobe_node); + +/* Node insertion and deletion requires the fprobe_mutex */ +static void insert_fprobe_node(struct fprobe_hlist_node *node) +{ + unsigned long ip = node->addr; + struct fprobe_hlist_node *next; + struct hlist_head *head; - if (fp->entry_handler) - ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data); + lockdep_assert_held(&fprobe_mutex); - /* If entry_handler returns !0, nmissed is not counted. */ - if (rh) { - if (ret) - rethook_recycle(rh); - else - rethook_hook(rh, ftrace_get_regs(fregs), true); + next = find_first_fprobe_node(ip); + if (next) { + hlist_add_before_rcu(&node->hlist, &next->hlist); + return; } + head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; + hlist_add_head_rcu(&node->hlist, head); } -static void fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +/* Return true if there are synonims */ +static bool delete_fprobe_node(struct fprobe_hlist_node *node) { - struct fprobe *fp; - int bit; + lockdep_assert_held(&fprobe_mutex); - fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; + WRITE_ONCE(node->fp, NULL); + hlist_del_rcu(&node->hlist); + return !!find_first_fprobe_node(node->addr); +} - /* recursion detection has to go before any traceable function and - * all functions before this point should be marked as notrace - */ - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; +/* Check existence of the fprobe */ +static bool is_fprobe_still_exist(struct fprobe *fp) +{ + struct hlist_head *head; + struct fprobe_hlist *fph; + + head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; + hlist_for_each_entry_rcu(fph, head, hlist, + lockdep_is_held(&fprobe_mutex)) { + if (fph->fp == fp) + return true; } - __fprobe_handler(ip, parent_ip, ops, fregs); - ftrace_test_recursion_unlock(bit); + return false; +} +NOKPROBE_SYMBOL(is_fprobe_still_exist); + +static int add_fprobe_hash(struct fprobe *fp) +{ + struct fprobe_hlist *fph = fp->hlist_array; + struct hlist_head *head; + + lockdep_assert_held(&fprobe_mutex); + + if (WARN_ON_ONCE(!fph)) + return -EINVAL; + + if (is_fprobe_still_exist(fp)) + return -EEXIST; + + head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; + hlist_add_head_rcu(&fp->hlist_array->hlist, head); + return 0; +} + +static int del_fprobe_hash(struct fprobe *fp) +{ + struct fprobe_hlist *fph = fp->hlist_array; + + 
lockdep_assert_held(&fprobe_mutex); + + if (WARN_ON_ONCE(!fph)) + return -EINVAL; + + if (!is_fprobe_still_exist(fp)) + return -ENOENT; + + fph->fp = NULL; + hlist_del_rcu(&fph->hlist); + return 0; +} + +#ifdef ARCH_DEFINE_ENCODE_FPROBE_HEADER + +/* The arch should encode fprobe_header info into one unsigned long */ +#define FPROBE_HEADER_SIZE_IN_LONG 1 + +static inline bool write_fprobe_header(unsigned long *stack, + struct fprobe *fp, unsigned int size_words) +{ + if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD || + !arch_fprobe_header_encodable(fp))) + return false; + *stack = arch_encode_fprobe_header(fp, size_words); + return true; } -NOKPROBE_SYMBOL(fprobe_handler); -static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +static inline void read_fprobe_header(unsigned long *stack, + struct fprobe **fp, unsigned int *size_words) { + *fp = arch_decode_fprobe_header_fp(*stack); + *size_words = arch_decode_fprobe_header_size(*stack); +} + +#else + +/* Generic fprobe_header */ +struct __fprobe_header { struct fprobe *fp; - int bit; + unsigned long size_words; +} __packed; - fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; +#define FPROBE_HEADER_SIZE_IN_LONG SIZE_IN_LONG(sizeof(struct __fprobe_header)) - /* recursion detection has to go before any traceable function and - * all functions called before this point should be marked as notrace - */ - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; - } +static inline bool write_fprobe_header(unsigned long *stack, + struct fprobe *fp, unsigned int size_words) +{ + struct __fprobe_header *fph = (struct __fprobe_header *)stack; + + if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD)) + return false; + + fph->fp = fp; + fph->size_words = size_words; + return true; +} + +static inline void read_fprobe_header(unsigned long *stack, + struct fprobe **fp, unsigned int *size_words) +{ + struct __fprobe_header *fph = (struct __fprobe_header *)stack; + + *fp = fph->fp; + *size_words = fph->size_words; +} + +#endif + +/* + * fprobe shadow stack management: + * Since fprobe shares a single fgraph_ops, it needs to share the stack entry + * among the probes on the same function exit. Note that a new probe can be + * registered before a target function is returning, we can not use the hash + * table to find the corresponding probes. Thus the probe address is stored on + * the shadow stack with its entry data size. + * + */ +static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct fprobe *fp, struct ftrace_regs *fregs, + void *data) +{ + if (!fp->entry_handler) + return 0; + return fp->entry_handler(fp, ip, parent_ip, fregs, data); +} + +static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, + struct fprobe *fp, struct ftrace_regs *fregs, + void *data) +{ + int ret; /* * This user handler is shared with other kprobes and is not expected to be * called recursively. 
So if any other kprobe handler is running, this will @@ -108,44 +234,182 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, */ if (unlikely(kprobe_running())) { fp->nmissed++; - goto recursion_unlock; + return 0; } kprobe_busy_begin(); - __fprobe_handler(ip, parent_ip, ops, fregs); + ret = __fprobe_handler(ip, parent_ip, fp, fregs, data); kprobe_busy_end(); - -recursion_unlock: - ftrace_test_recursion_unlock(bit); + return ret; } -static void fprobe_exit_handler(struct rethook_node *rh, void *data, - unsigned long ret_ip, struct pt_regs *regs) +static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs) { - struct fprobe *fp = (struct fprobe *)data; - struct fprobe_rethook_node *fpr; - int bit; + struct fprobe_hlist_node *node, *first; + unsigned long *fgraph_data = NULL; + unsigned long func = trace->func; + unsigned long ret_ip; + int reserved_words; + struct fprobe *fp; + int used, ret; - if (!fp || fprobe_disabled(fp)) - return; + if (WARN_ON_ONCE(!fregs)) + return 0; + + first = node = find_first_fprobe_node(func); + if (unlikely(!first)) + return 0; - fpr = container_of(rh, struct fprobe_rethook_node, node); + reserved_words = 0; + hlist_for_each_entry_from_rcu(node, hlist) { + if (node->addr != func) + break; + fp = READ_ONCE(node->fp); + if (!fp || !fp->exit_handler) + continue; + /* + * Since fprobe can be enabled until the next loop, we ignore the + * fprobe's disabled flag in this loop. + */ + reserved_words += + FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); + } + node = first; + if (reserved_words) { + fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); + if (unlikely(!fgraph_data)) { + hlist_for_each_entry_from_rcu(node, hlist) { + if (node->addr != func) + break; + fp = READ_ONCE(node->fp); + if (fp && !fprobe_disabled(fp)) + fp->nmissed++; + } + return 0; + } + } /* - * we need to assure no calls to traceable functions in-between the - * end of fprobe_handler and the beginning of fprobe_exit_handler. + * TODO: recursion detection has been done in the fgraph. Thus we need + * to add a callback to increment missed counter. */ - bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip); - if (bit < 0) { - fp->nmissed++; + ret_ip = ftrace_regs_get_return_address(fregs); + used = 0; + hlist_for_each_entry_from_rcu(node, hlist) { + int data_size; + void *data; + + if (node->addr != func) + break; + fp = READ_ONCE(node->fp); + if (!fp || fprobe_disabled(fp)) + continue; + + data_size = fp->entry_data_size; + if (data_size && fp->exit_handler) + data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG; + else + data = NULL; + + if (fprobe_shared_with_kprobes(fp)) + ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data); + else + ret = __fprobe_handler(func, ret_ip, fp, fregs, data); + + /* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */ + if (!ret && fp->exit_handler) { + int size_words = SIZE_IN_LONG(data_size); + + if (write_fprobe_header(&fgraph_data[used], fp, size_words)) + used += FPROBE_HEADER_SIZE_IN_LONG + size_words; + } + } + if (used < reserved_words) + memset(fgraph_data + used, 0, reserved_words - used); + + /* If any exit_handler is set, data must be used. 
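The entry path above reserves fgraph storage and packs one [header][entry-data] record per fprobe that has an exit handler; fprobe_return() later walks those records back with read_fprobe_header(). A minimal userspace sketch of the generic (non-arch-encoded) layout, using a stand-in struct fprobe and plain division instead of the shift in SIZE_IN_LONG():

        /*
         * Sketch only: each record is [header][entry data], where the header
         * carries the fprobe pointer and the data size in longs.
         */
        #include <stdio.h>
        #include <string.h>

        struct fprobe { const char *name; size_t entry_data_size; };  /* stand-in */

        #define SIZE_IN_LONG(x) (((x) + sizeof(long) - 1) / sizeof(long))

        struct header { struct fprobe *fp; unsigned long size_words; };
        #define HEADER_SIZE_IN_LONG SIZE_IN_LONG(sizeof(struct header))

        static size_t write_record(unsigned long *slot, struct fprobe *fp, const void *data)
        {
                struct header *h = (struct header *)slot;
                unsigned long words = SIZE_IN_LONG(fp->entry_data_size);

                h->fp = fp;
                h->size_words = words;
                memcpy(slot + HEADER_SIZE_IN_LONG, data, fp->entry_data_size);
                return HEADER_SIZE_IN_LONG + words;     /* slots consumed */
        }

        int main(void)
        {
                unsigned long stack[32];
                struct fprobe a = { "a", sizeof(int) }, b = { "b", 3 * sizeof(long) };
                int adata = 42;
                long bdata[3] = { 1, 2, 3 };
                size_t used = 0, curr = 0;

                used += write_record(stack + used, &a, &adata);
                used += write_record(stack + used, &b, bdata);

                while (curr < used) {   /* mirrors the walk in fprobe_return() */
                        struct header *h = (struct header *)(stack + curr);

                        printf("%s: %lu data word(s)\n", h->fp->name, h->size_words);
                        curr += HEADER_SIZE_IN_LONG + h->size_words;
                }
                return 0;
        }

The real patch additionally lets an architecture pack the same information into a single long (ARCH_DEFINE_ENCODE_FPROBE_HEADER); the sketch only models the generic two-long header.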
*/ + return used != 0; +} +NOKPROBE_SYMBOL(fprobe_entry); + +static void fprobe_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + unsigned long *fgraph_data = NULL; + unsigned long ret_ip; + struct fprobe *fp; + int size, curr; + int size_words; + + fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size); + if (WARN_ON_ONCE(!fgraph_data)) return; + size_words = SIZE_IN_LONG(size); + ret_ip = ftrace_regs_get_instruction_pointer(fregs); + + preempt_disable(); + + curr = 0; + while (size_words > curr) { + read_fprobe_header(&fgraph_data[curr], &fp, &size); + if (!fp) + break; + curr += FPROBE_HEADER_SIZE_IN_LONG; + if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) { + if (WARN_ON_ONCE(curr + size > size_words)) + break; + fp->exit_handler(fp, trace->func, ret_ip, fregs, + size ? fgraph_data + curr : NULL); + } + curr += size; + } + preempt_enable(); +} +NOKPROBE_SYMBOL(fprobe_return); + +static struct fgraph_ops fprobe_graph_ops = { + .entryfunc = fprobe_entry, + .retfunc = fprobe_return, +}; +static int fprobe_graph_active; + +/* Add @addrs to the ftrace filter and register fgraph if needed. */ +static int fprobe_graph_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_graph_active) { + ret = register_ftrace_graph(&fprobe_graph_ops); + if (WARN_ON_ONCE(ret)) { + ftrace_free_filter(&fprobe_graph_ops.ops); + return ret; + } } + fprobe_graph_active++; + return 0; +} - fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs, - fp->entry_data_size ? (void *)fpr->data : NULL); - ftrace_test_recursion_unlock(bit); +/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */ +static void fprobe_graph_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + fprobe_graph_active--; + /* Q: should we unregister it ? 
*/ + if (!fprobe_graph_active) + unregister_ftrace_graph(&fprobe_graph_ops); + + if (num) + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } -NOKPROBE_SYMBOL(fprobe_exit_handler); static int symbols_cmp(const void *a, const void *b) { @@ -175,53 +439,97 @@ static unsigned long *get_ftrace_locations(const char **syms, int num) return ERR_PTR(-ENOENT); } -static void fprobe_init(struct fprobe *fp) -{ - fp->nmissed = 0; - if (fprobe_shared_with_kprobes(fp)) - fp->ops.func = fprobe_kprobe_handler; - else - fp->ops.func = fprobe_handler; - fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; -} +struct filter_match_data { + const char *filter; + const char *notfilter; + size_t index; + size_t size; + unsigned long *addrs; +}; -static int fprobe_init_rethook(struct fprobe *fp, int num) +static int filter_match_callback(void *data, const char *name, unsigned long addr) { - int size; + struct filter_match_data *match = data; - if (!fp->exit_handler) { - fp->rethook = NULL; + if (!glob_match(match->filter, name) || + (match->notfilter && glob_match(match->notfilter, name))) return 0; - } - /* Initialize rethook if needed */ - if (fp->nr_maxactive) - num = fp->nr_maxactive; - else - num *= num_possible_cpus() * 2; - if (num <= 0) - return -EINVAL; + if (!ftrace_location(addr)) + return 0; + + if (match->addrs) + match->addrs[match->index] = addr; - size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size; + match->index++; + return match->index == match->size; +} - /* Initialize rethook */ - fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num); - if (IS_ERR(fp->rethook)) - return PTR_ERR(fp->rethook); +/* + * Make IP list from the filter/no-filter glob patterns. + * Return the number of matched symbols, or -ENOENT. + */ +static int ip_list_from_filter(const char *filter, const char *notfilter, + unsigned long *addrs, size_t size) +{ + struct filter_match_data match = { .filter = filter, .notfilter = notfilter, + .index = 0, .size = size, .addrs = addrs}; + int ret; - return 0; + ret = kallsyms_on_each_symbol(filter_match_callback, &match); + if (ret < 0) + return ret; + ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); + if (ret < 0) + return ret; + + return match.index ?: -ENOENT; } static void fprobe_fail_cleanup(struct fprobe *fp) { - if (!IS_ERR_OR_NULL(fp->rethook)) { - /* Don't need to cleanup rethook->handler because this is not used. */ - rethook_free(fp->rethook); - fp->rethook = NULL; + kfree(fp->hlist_array); + fp->hlist_array = NULL; +} + +/* Initialize the fprobe data structure. */ +static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) +{ + struct fprobe_hlist *hlist_array; + unsigned long addr; + int size, i; + + if (!fp || !addrs || num <= 0) + return -EINVAL; + + size = ALIGN(fp->entry_data_size, sizeof(long)); + if (size > MAX_FPROBE_DATA_SIZE) + return -E2BIG; + fp->entry_data_size = size; + + hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL); + if (!hlist_array) + return -ENOMEM; + + fp->nmissed = 0; + + hlist_array->size = num; + fp->hlist_array = hlist_array; + hlist_array->fp = fp; + for (i = 0; i < num; i++) { + hlist_array->array[i].fp = fp; + addr = ftrace_location(addrs[i]); + if (!addr) { + fprobe_fail_cleanup(fp); + return -ENOENT; + } + hlist_array->array[i].addr = addr; } - ftrace_free_filter(&fp->ops); + return 0; } +#define FPROBE_IPS_MAX INT_MAX + /** * register_fprobe() - Register fprobe to ftrace by pattern. * @fp: A fprobe data structure to be registered. 
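The filter_match_callback()/ip_list_from_filter() pair above keeps a symbol when it matches the filter glob, does not match the optional notfilter glob, and has an ftrace location; passing a NULL address array lets the same routine be used first to count matches and then to fill the array. A rough userspace sketch of that selection, with fnmatch(3) standing in for the kernel's glob_match() and a fixed list standing in for kallsyms:

        #include <fnmatch.h>
        #include <stdio.h>
        #include <stdlib.h>

        static const char *syms[] = { "vfs_read", "vfs_write", "vfs_fsync", "do_exit" };

        static int match_syms(const char *filter, const char *notfilter,
                              const char **out, size_t size)
        {
                size_t i, n = 0;

                for (i = 0; i < sizeof(syms) / sizeof(syms[0]); i++) {
                        if (fnmatch(filter, syms[i], 0) != 0)
                                continue;
                        if (notfilter && fnmatch(notfilter, syms[i], 0) == 0)
                                continue;
                        if (out) {
                                if (n == size)
                                        break;
                                out[n] = syms[i];
                        }
                        n++;
                }
                return n ? (int)n : -1; /* the kernel code returns -ENOENT here */
        }

        int main(void)
        {
                int i, cnt = match_syms("vfs_*", "*fsync", NULL, 0);    /* count pass */
                const char **list;

                if (cnt < 0)
                        return 1;
                list = calloc(cnt, sizeof(*list));
                cnt = match_syms("vfs_*", "*fsync", list, cnt);         /* fill pass */
                for (i = 0; i < cnt; i++)
                        printf("%s\n", list[i]);
                free(list);
                return 0;
        }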
@@ -235,46 +543,24 @@ static void fprobe_fail_cleanup(struct fprobe *fp) */ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { - struct ftrace_hash *hash; - unsigned char *str; - int ret, len; + unsigned long *addrs; + int ret; if (!fp || !filter) return -EINVAL; - fprobe_init(fp); - - len = strlen(filter); - str = kstrdup(filter, GFP_KERNEL); - ret = ftrace_set_filter(&fp->ops, str, len, 0); - kfree(str); - if (ret) + ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX); + if (ret < 0) return ret; - if (notfilter) { - len = strlen(notfilter); - str = kstrdup(notfilter, GFP_KERNEL); - ret = ftrace_set_notrace(&fp->ops, str, len, 0); - kfree(str); - if (ret) - goto out; - } - - /* TODO: - * correctly calculate the total number of filtered symbols - * from both filter and notfilter. - */ - hash = rcu_access_pointer(fp->ops.local_hash.filter_hash); - if (WARN_ON_ONCE(!hash)) - goto out; - - ret = fprobe_init_rethook(fp, (int)hash->count); - if (!ret) - ret = register_ftrace_function(&fp->ops); + addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + ret = ip_list_from_filter(filter, notfilter, addrs, ret); + if (ret > 0) + ret = register_fprobe_ips(fp, addrs, ret); -out: - if (ret) - fprobe_fail_cleanup(fp); + kfree(addrs); return ret; } EXPORT_SYMBOL_GPL(register_fprobe); @@ -282,7 +568,7 @@ EXPORT_SYMBOL_GPL(register_fprobe); /** * register_fprobe_ips() - Register fprobe to ftrace by address. * @fp: A fprobe data structure to be registered. - * @addrs: An array of target ftrace location addresses. + * @addrs: An array of target function address. * @num: The number of entries of @addrs. * * Register @fp to ftrace for enabling the probe on the address given by @addrs. @@ -294,23 +580,27 @@ EXPORT_SYMBOL_GPL(register_fprobe); */ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) { - int ret; - - if (!fp || !addrs || num <= 0) - return -EINVAL; - - fprobe_init(fp); + struct fprobe_hlist *hlist_array; + int ret, i; - ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + ret = fprobe_init(fp, addrs, num); if (ret) return ret; - ret = fprobe_init_rethook(fp, num); - if (!ret) - ret = register_ftrace_function(&fp->ops); + mutex_lock(&fprobe_mutex); + + hlist_array = fp->hlist_array; + ret = fprobe_graph_add_ips(addrs, num); + if (!ret) { + add_fprobe_hash(fp); + for (i = 0; i < hlist_array->size; i++) + insert_fprobe_node(&hlist_array->array[i]); + } + mutex_unlock(&fprobe_mutex); if (ret) fprobe_fail_cleanup(fp); + return ret; } EXPORT_SYMBOL_GPL(register_fprobe_ips); @@ -348,14 +638,13 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms); bool fprobe_is_registered(struct fprobe *fp) { - if (!fp || (fp->ops.saved_func != fprobe_handler && - fp->ops.saved_func != fprobe_kprobe_handler)) + if (!fp || !fp->hlist_array) return false; return true; } /** - * unregister_fprobe() - Unregister fprobe from ftrace + * unregister_fprobe() - Unregister fprobe. * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). 
@@ -364,23 +653,40 @@ bool fprobe_is_registered(struct fprobe *fp) */ int unregister_fprobe(struct fprobe *fp) { - int ret; + struct fprobe_hlist *hlist_array; + unsigned long *addrs = NULL; + int ret = 0, i, count; - if (!fprobe_is_registered(fp)) - return -EINVAL; + mutex_lock(&fprobe_mutex); + if (!fp || !is_fprobe_still_exist(fp)) { + ret = -EINVAL; + goto out; + } + + hlist_array = fp->hlist_array; + addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL); + if (!addrs) { + ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */ + goto out; + } - if (!IS_ERR_OR_NULL(fp->rethook)) - rethook_stop(fp->rethook); + /* Remove non-synonim ips from table and hash */ + count = 0; + for (i = 0; i < hlist_array->size; i++) { + if (!delete_fprobe_node(&hlist_array->array[i])) + addrs[count++] = hlist_array->array[i].addr; + } + del_fprobe_hash(fp); - ret = unregister_ftrace_function(&fp->ops); - if (ret < 0) - return ret; + fprobe_graph_remove_ips(addrs, count); - if (!IS_ERR_OR_NULL(fp->rethook)) - rethook_free(fp->rethook); + kfree_rcu(hlist_array, rcu); + fp->hlist_array = NULL; - ftrace_free_filter(&fp->ops); +out: + mutex_unlock(&fprobe_mutex); + kfree(addrs); return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e113f8b13a2..fc88e0688daf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -536,24 +536,22 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; - int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER static struct trace_seq s; unsigned long long avg; unsigned long long stddev; + unsigned long long stddev_denom; #endif - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); /* we raced with function_profile_reset() */ - if (unlikely(rec->counter == 0)) { - ret = -EBUSY; - goto out; - } + if (unlikely(rec->counter == 0)) + return -EBUSY; #ifdef CONFIG_FUNCTION_GRAPH_TRACER avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) - goto out; + return 0; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -562,23 +560,19 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_puts(m, " "); - /* Sample standard deviation (s^2) */ - if (rec->counter <= 1) - stddev = 0; - else { - /* - * Apply Welford's method: - * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) - */ + /* + * Variance formula: + * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) + * Maybe Welford's method is better here? + * Divide only by 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide by 1000 again. + */ + stddev = 0; + stddev_denom = rec->counter * (rec->counter - 1) * 1000; + if (stddev_denom) { stddev = rec->counter * rec->time_squared - rec->time * rec->time; - - /* - * Divide only 1000 for ns^2 -> us^2 conversion. - * trace_print_graph_duration will divide 1000 again. 
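The rewritten function_stat_show() block keeps the single-pass variance formula (now spelled out in the comment) and simply folds the counter <= 1 case into a zero-denominator check; the extra factor of 1000 in the denominator is only the ns^2 -> us^2 unit conversion. A quick standalone check, on the made-up sample {2, 4, 9}, that the closed form matches the textbook two-pass definition:

        #include <assert.h>
        #include <stdio.h>

        int main(void)
        {
                const double x[] = { 2, 4, 9 };
                const int n = 3;
                double sum = 0, sumsq = 0, mean, s2_def = 0, s2_closed;
                int i;

                for (i = 0; i < n; i++) {
                        sum += x[i];
                        sumsq += x[i] * x[i];
                }
                mean = sum / n;
                for (i = 0; i < n; i++)                 /* textbook definition */
                        s2_def += (x[i] - mean) * (x[i] - mean);
                s2_def /= n - 1;

                /* s^2 = (n * sum(x_i^2) - (sum x_i)^2) / (n * (n - 1)) */
                s2_closed = (n * sumsq - sum * sum) / (n * (n - 1));

                printf("%.1f %.1f\n", s2_def, s2_closed);       /* both print 13.0 */
                assert(s2_def == s2_closed);
                return 0;
        }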
- */ - stddev = div64_ul(stddev, - rec->counter * (rec->counter - 1) * 1000); + stddev = div64_ul(stddev, stddev_denom); } trace_seq_init(&s); @@ -590,10 +584,8 @@ static int function_stat_show(struct seq_file *m, void *v) trace_print_seq(m, &s); #endif seq_putc(m, '\n'); -out: - mutex_unlock(&ftrace_profile_lock); - return ret; + return 0; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) @@ -789,27 +781,24 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; - unsigned long flags; if (!ftrace_profile_enabled) return; - local_irq_save(flags); + guard(preempt_notrace)(); stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; rec = ftrace_find_profiled_func(stat, ip); if (!rec) { rec = ftrace_profile_alloc(stat, ip); if (!rec) - goto out; + return; } rec->counter++; - out: - local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -827,7 +816,8 @@ struct profile_fgraph_data { }; static int profile_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct profile_fgraph_data *profile_data; @@ -849,26 +839,27 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, } static void profile_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct profile_fgraph_data *profile_data; struct ftrace_profile_stat *stat; unsigned long long calltime; unsigned long long rettime = trace_clock_local(); struct ftrace_profile *rec; - unsigned long flags; int size; - local_irq_save(flags); + guard(preempt_notrace)(); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; profile_data = fgraph_retrieve_data(gops->idx, &size); /* If the calltime was zero'd ignore it */ if (!profile_data || !profile_data->calltime) - goto out; + return; calltime = rettime - profile_data->calltime; @@ -896,9 +887,6 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, rec->time += calltime; rec->time_squared += calltime * calltime; } - - out: - local_irq_restore(flags); } static struct fgraph_ops fprofiler_ops = { @@ -946,20 +934,16 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, val = !!val; - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); if (ftrace_profile_enabled ^ val) { if (val) { ret = ftrace_profile_init(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ret = register_ftrace_profiler(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; @@ -970,8 +954,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, unregister_ftrace_profiler(); } } - out: - mutex_unlock(&ftrace_profile_lock); *ppos += cnt; @@ -1671,14 +1653,12 @@ unsigned long ftrace_location(unsigned long ip) loc = ftrace_location_range(ip, ip); if (!loc) { if (!kallsyms_lookup_size_offset(ip, &size, &offset)) - goto out; + return 0; /* map sym+0 to __fentry__ */ if (!offset) loc = ftrace_location_range(ip, ip + size - 1); } - -out: return loc; } @@ -2073,7 +2053,7 @@ rollback: continue; if (rec == end) - goto err_out; + return -EBUSY; in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -2086,7 +2066,6 @@ rollback: rec->flags |= FTRACE_FL_IPMODIFY; } 
while_for_each_ftrace_rec(); -err_out: return -EBUSY; } @@ -3238,15 +3217,22 @@ static struct ftrace_hash *copy_hash(struct ftrace_hash *src) * The filter_hash updates uses just the append_hash() function * and the notrace_hash does not. */ -static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash) +static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash, + int size_bits) { struct ftrace_func_entry *entry; int size; int i; - /* An empty hash does everything */ - if (ftrace_hash_empty(*hash)) - return 0; + if (*hash) { + /* An empty hash does everything */ + if (ftrace_hash_empty(*hash)) + return 0; + } else { + *hash = alloc_ftrace_hash(size_bits); + if (!*hash) + return -ENOMEM; + } /* If new_hash has everything make hash have everything */ if (ftrace_hash_empty(new_hash)) { @@ -3310,16 +3296,18 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has /* Return a new hash that has a union of all @ops->filter_hash entries */ static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) { - struct ftrace_hash *new_hash; + struct ftrace_hash *new_hash = NULL; struct ftrace_ops *subops; + int size_bits; int ret; - new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits); - if (!new_hash) - return NULL; + if (ops->func_hash->filter_hash) + size_bits = ops->func_hash->filter_hash->size_bits; + else + size_bits = FTRACE_HASH_DEFAULT_BITS; list_for_each_entry(subops, &ops->subop_list, list) { - ret = append_hash(&new_hash, subops->func_hash->filter_hash); + ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits); if (ret < 0) { free_ftrace_hash(new_hash); return NULL; @@ -3328,7 +3316,8 @@ static struct ftrace_hash *append_hashes(struct ftrace_ops *ops) if (ftrace_hash_empty(new_hash)) break; } - return new_hash; + /* Can't return NULL as that means this failed */ + return new_hash ? 
: EMPTY_HASH; } /* Make @ops trace evenything except what all its subops do not trace */ @@ -3523,7 +3512,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash); if (!filter_hash) return -ENOMEM; - ret = append_hash(&filter_hash, subops->func_hash->filter_hash); + ret = append_hash(&filter_hash, subops->func_hash->filter_hash, + size_bits); if (ret < 0) { free_ftrace_hash(filter_hash); return ret; @@ -4926,23 +4916,6 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); } -static bool module_exists(const char *module) -{ - /* All modules have the symbol __this_module */ - static const char this_mod[] = "__this_module"; - char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; - unsigned long val; - int n; - - n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); - - if (n > sizeof(modname) - 1) - return false; - - val = module_kallsyms_lookup_name(modname); - return val != 0; -} - static int cache_mod(struct trace_array *tr, const char *func, char *module, int enable) { @@ -4982,10 +4955,6 @@ static int cache_mod(struct trace_array *tr, return ftrace_add_mod(tr, func, module, enable); } -static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable); - #ifdef CONFIG_MODULES static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, char *mod, bool enable) @@ -5615,20 +5584,15 @@ static DEFINE_MUTEX(ftrace_cmd_mutex); __init int register_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p; - int ret = 0; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } + if (strcmp(cmd->name, p->name) == 0) + return -EBUSY; } list_add(&cmd->list, &ftrace_commands); - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return 0; } /* @@ -5638,20 +5602,17 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd) __init int unregister_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p, *n; - int ret = -ENODEV; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { if (strcmp(cmd->name, p->name) == 0) { - ret = 0; list_del_init(&p->list); - goto out_unlock; + return 0; } } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return -ENODEV; } static int ftrace_process_regex(struct ftrace_iterator *iter, @@ -5661,7 +5622,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, struct trace_array *tr = iter->ops->private; char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret = -EINVAL; + int ret; func = strsep(&next, ":"); @@ -5678,17 +5639,14 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, command = strsep(&next, ":"); - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->func(tr, hash, func, command, next, enable); - goto out_unlock; - } + if (strcmp(p->name, command) == 0) + return p->func(tr, hash, func, command, next, enable); } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return -EINVAL; } static ssize_t @@ -5722,12 +5680,10 @@ 
ftrace_regex_write(struct file *file, const char __user *ubuf, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) - goto out; + return ret; } - ret = read; - out: - return ret; + return read; } ssize_t @@ -5759,6 +5715,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return -ENOENT; free_hash_entry(hash, entry); return 0; + } else if (__ftrace_lookup_ip(hash, ip) != NULL) { + /* Already exists */ + return 0; } entry = add_hash_entry(hash, ip); @@ -5788,7 +5747,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips, static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long *ips, unsigned int cnt, - int remove, int reset, int enable) + int remove, int reset, int enable, char *mod) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -5814,7 +5773,15 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, goto out_regex_unlock; } - if (buf && !ftrace_match_records(hash, buf, len)) { + if (buf && !match_records(hash, buf, len, mod)) { + /* If this was for a module and nothing was enabled, flag it */ + if (mod) + (*orig_hash)->flags |= FTRACE_HASH_FL_MOD; + + /* + * Even if it is a mod, return error to let caller know + * nothing was added + */ ret = -EINVAL; goto out_regex_unlock; } @@ -5839,7 +5806,7 @@ static int ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, int remove, int reset, int enable) { - return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable, NULL); } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -6217,7 +6184,38 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { - return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable); + char *mod = NULL, *func, *command, *next = buf; + char *tmp __free(kfree) = NULL; + struct trace_array *tr = ops->private; + int ret; + + func = strsep(&next, ":"); + + /* This can also handle :mod: parsing */ + if (next) { + if (!tr) + return -EINVAL; + + command = strsep(&next, ":"); + if (strcmp(command, "mod") != 0) + return -EINVAL; + + mod = next; + len = command - func; + /* Save the original func as ftrace_set_hash() can modify it */ + tmp = kstrdup(func, GFP_KERNEL); + } + + ret = ftrace_set_hash(ops, func, len, NULL, 0, 0, reset, enable, mod); + + if (tr && mod && ret < 0) { + /* Did tmp fail to allocate? 
*/ + if (!tmp) + return -ENOMEM; + ret = cache_mod(tr, tmp, mod, enable); + } + + return ret; } /** @@ -6381,6 +6379,14 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) ftrace_ops_init(ops); + /* The trace_array is needed for caching module function filters */ + if (!ops->private) { + struct trace_array *tr = trace_get_global_array(); + + ops->private = tr; + ftrace_init_trace_array(tr); + } + while (buf) { func = strsep(&buf, ","); ftrace_set_regex(ops, func, strlen(func), 0, enable); @@ -7814,9 +7820,14 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) void ftrace_init_trace_array(struct trace_array *tr) { + if (tr->flags & TRACE_ARRAY_FL_MOD_INIT) + return; + INIT_LIST_HEAD(&tr->func_probes); INIT_LIST_HEAD(&tr->mod_trace); INIT_LIST_HEAD(&tr->mod_notrace); + + tr->flags |= TRACE_ARRAY_FL_MOD_INIT; } #else @@ -7845,7 +7856,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) __init void ftrace_init_global_array_ops(struct trace_array *tr) { tr->ops = &global_ops; - tr->ops->private = tr; + if (!global_ops.private) + global_ops.private = tr; ftrace_init_trace_array(tr); init_array_fgraph_ops(tr, tr->ops); } @@ -8287,7 +8299,7 @@ pid_write(struct file *filp, const char __user *ubuf, if (!cnt) return 0; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); switch (type) { case TRACE_PIDS: @@ -8303,14 +8315,13 @@ pid_write(struct file *filp, const char __user *ubuf, lockdep_is_held(&ftrace_lock)); break; default: - ret = -EINVAL; WARN_ON_ONCE(1); - goto out; + return -EINVAL; } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) - goto out; + return ret; switch (type) { case TRACE_PIDS: @@ -8339,11 +8350,8 @@ pid_write(struct file *filp, const char __user *ubuf, ftrace_update_pid_func(); ftrace_startup_all(0); - out: - mutex_unlock(&ftrace_lock); - if (ret > 0) - *ppos += ret; + *ppos += ret; return ret; } @@ -8746,17 +8754,17 @@ static int ftrace_enable_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret = -ENODEV; + int ret; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); if (unlikely(ftrace_disabled)) - goto out; + return -ENODEV; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) - goto out; + return ret; if (ftrace_enabled) { @@ -8770,8 +8778,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, } else { if (is_permanent_ops_registered()) { ftrace_enabled = true; - ret = -EBUSY; - goto out; + return -EBUSY; } /* stopping ftrace calls (just send to ftrace_stub) */ @@ -8781,12 +8788,10 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, } last_ftrace_enabled = !!ftrace_enabled; - out: - mutex_unlock(&ftrace_lock); - return ret; + return 0; } -static struct ctl_table ftrace_sysctls[] = { +static const struct ctl_table ftrace_sysctls[] = { { .procname = "ftrace_enabled", .data = &ftrace_enabled, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 60210fb5b211..bb6089c2951e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1672,7 +1672,8 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) * must be the same. 
*/ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages) + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1680,6 +1681,9 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, unsigned long buffers_end; int i; + if (!subbuf_mask) + return false; + /* Check the meta magic and meta struct size */ if (meta->magic != RING_BUFFER_META_MAGIC || meta->struct_size != sizeof(*meta)) { @@ -1712,6 +1716,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, subbuf = rb_subbufs_from_meta(meta); + bitmap_clear(subbuf_mask, 0, meta->nr_subbufs); + /* Is the meta buffers and the subbufs themselves have correct data? */ for (i = 0; i < meta->nr_subbufs; i++) { if (meta->buffers[i] < 0 || @@ -1725,6 +1731,12 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return false; } + if (test_bit(meta->buffers[i], subbuf_mask)) { + pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu); + return false; + } + + set_bit(meta->buffers[i], subbuf_mask); subbuf = (void *)subbuf + subbuf_size; } @@ -1838,6 +1850,11 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->cpu); goto invalid; } + + /* If the buffer has content, update pages_touched */ + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; entry_bytes += local_read(&head_page->page->commit); local_set(&cpu_buffer->head_page->entries, ret); @@ -1889,17 +1906,22 @@ static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) { struct ring_buffer_meta *meta; + unsigned long *subbuf_mask; unsigned long delta; void *subbuf; int cpu; int i; + /* Create a mask to test the subbuf array */ + subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); + /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages)) { + if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; @@ -1943,6 +1965,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf += meta->subbuf_size; } } + bitmap_free(subbuf_mask); } static void *rbm_start(struct seq_file *m, loff_t *pos) @@ -4398,8 +4421,13 @@ rb_reserve_next_event(struct trace_buffer *buffer, int nr_loops = 0; int add_ts_default; - /* ring buffer does cmpxchg, make sure it is safe in NMI context */ - if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && + /* + * ring buffer does cmpxchg as well as atomic64 operations + * (which some archs use locking for atomic64), make sure this + * is safe in NMI context + */ + if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) || + IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) && (unlikely(in_nmi()))) { return NULL; } @@ -4682,40 +4710,22 @@ int ring_buffer_write(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. 
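With that accounting, the emptiness check reduces to simple arithmetic: a per-CPU buffer is empty when every entry ever written has either been read or overwritten. A toy model of the three counters, outside the kernel:

        #include <stdio.h>

        struct counters { unsigned long entries, read, overrun; };

        static unsigned long num_of_entries(const struct counters *c)
        {
                return c->entries - (c->overrun + c->read);
        }

        int main(void)
        {
                struct counters c = { .entries = 10,    /* 10 events written      */
                                      .read    = 4,     /* 4 consumed by a reader */
                                      .overrun = 3 };   /* 3 lost to overwrite    */

                printf("unread: %lu, empty: %d\n",
                       num_of_entries(&c), num_of_entries(&c) == 0);
                return 0;
        }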
+ */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) { - struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = rb_set_head_page(cpu_buffer); - struct buffer_page *commit = cpu_buffer->commit_page; - - /* In case of error, head will be NULL */ - if (unlikely(!head)) - return true; - - /* Reader should exhaust content in reader page */ - if (reader->read != rb_page_size(reader)) - return false; - - /* - * If writers are committing on the reader page, knowing all - * committed content has been read, the ring buffer is empty. - */ - if (commit == reader) - return true; - - /* - * If writers are committing on a page other than reader page - * and head page, there should always be content to read. - */ - if (commit != head) - return false; + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} - /* - * Writers are committing on the head page, we just need - * to care about there're committed data, and the reader will - * swap reader page with head page when it is to read data. - */ - return rb_page_commit(commit) == 0; +static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + return !rb_num_of_entries(cpu_buffer); } /** @@ -4861,19 +4871,6 @@ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); -/* - * The total entries in the ring buffer is the running counter - * of entries entered into the ring buffer, minus the sum of - * the entries read from the ring buffer and the number of - * entries that were overwritten. - */ -static inline unsigned long -rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) -{ - return local_read(&cpu_buffer->entries) - - (local_read(&cpu_buffer->overrun) + cpu_buffer->read); -} - /** * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer * @buffer: The ring buffer @@ -7059,7 +7056,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, } while (p < nr_pages) { - struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + struct page *page; int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) { @@ -7067,6 +7064,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, goto out; } + page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) break; @@ -7150,6 +7149,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, kfree(cpu_buffer->subbuf_ids); cpu_buffer->subbuf_ids = NULL; rb_free_meta_page(cpu_buffer); + atomic_dec(&cpu_buffer->resize_disabled); } unlock: diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 831779607e84..8226352a0062 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -25,30 +25,9 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst -config RV_MON_WIP - depends on RV - depends on PREEMPT_TRACER - select DA_MON_EVENTS_IMPLICIT - bool "wip monitor" - help - Enable wip (wakeup in preemptive) sample monitor that illustrates - the usage of per-cpu monitors, and one limitation of the - preempt_disable/enable events. - - For further information, see: - Documentation/trace/rv/monitor_wip.rst - -config RV_MON_WWNR - depends on RV - select DA_MON_EVENTS_ID - bool "wwnr monitor" - help - Enable wwnr (wakeup while not running) sample monitor, this is a - sample monitor that illustrates the usage of per-task monitor. 
- The model is borken on purpose: it serves to test reactors. - - For further information, see: - Documentation/trace/rv/monitor_wwnr.rst +source "kernel/trace/rv/monitors/wip/Kconfig" +source "kernel/trace/rv/monitors/wwnr/Kconfig" +# Add new monitors here config RV_REACTORS bool "Runtime verification reactors" diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 963d14875b45..188b64668e1f 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -1,8 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -I $(src) # needed for trace events + obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o +# Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig new file mode 100644 index 000000000000..3ef664b5cd90 --- /dev/null +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -0,0 +1,12 @@ +config RV_MON_WIP + depends on RV + depends on PREEMPT_TRACER + select DA_MON_EVENTS_IMPLICIT + bool "wip monitor" + help + Enable wip (wakeup in preemptive) sample monitor that illustrates + the usage of per-cpu monitors, and one limitation of the + preempt_disable/enable events. + + For further information, see: + Documentation/trace/rv/monitor_wip.rst diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index b2b49a27e886..db7389157c87 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -10,7 +10,7 @@ #define MODULE_NAME "wip" -#include <trace/events/rv.h> +#include <rv_trace.h> #include <trace/events/sched.h> #include <trace/events/preemptirq.h> diff --git a/kernel/trace/rv/monitors/wip/wip_trace.h b/kernel/trace/rv/monitors/wip/wip_trace.h new file mode 100644 index 000000000000..aa2162f47a4c --- /dev/null +++ b/kernel/trace/rv/monitors/wip/wip_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_WIP +DEFINE_EVENT(event_da_monitor, event_wip, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_wip, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_WIP */ diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig new file mode 100644 index 000000000000..ee741aa6d6b8 --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/Kconfig @@ -0,0 +1,11 @@ +config RV_MON_WWNR + depends on RV + select DA_MON_EVENTS_ID + bool "wwnr monitor" + help + Enable wwnr (wakeup while not running) sample monitor, this is a + sample monitor that illustrates the usage of per-task monitor. + The model is borken on purpose: it serves to test reactors. 
+ + For further information, see: + Documentation/trace/rv/monitor_wwnr.rst diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 0e43dd2db685..3b16994a9984 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -10,7 +10,7 @@ #define MODULE_NAME "wwnr" -#include <trace/events/rv.h> +#include <rv_trace.h> #include <trace/events/sched.h> #include "wwnr.h" diff --git a/kernel/trace/rv/monitors/wwnr/wwnr_trace.h b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h new file mode 100644 index 000000000000..fc97ec7476ad --- /dev/null +++ b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_WWNR +/* id is the pid of the task */ +DEFINE_EVENT(event_da_monitor_id, event_wwnr, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_wwnr, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_WWNR */ diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 279c70e1bd74..8657fc8806e7 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -145,7 +145,7 @@ #ifdef CONFIG_DA_MON_EVENTS #define CREATE_TRACE_POINTS -#include <trace/events/rv.h> +#include <rv_trace.h> #endif #include "rv.h" diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h new file mode 100644 index 000000000000..5e65097423ba --- /dev/null +++ b/kernel/trace/rv/rv_trace.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rv + +#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RV_H + +#include <linux/rv.h> +#include <linux/tracepoint.h> + +#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT +DECLARE_EVENT_CLASS(event_da_monitor, + + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + + TP_ARGS(state, event, next_state, final_state), + + TP_STRUCT__entry( + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + __array( char, next_state, MAX_DA_NAME_LEN ) + __field( bool, final_state ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); + __entry->final_state = final_state; + ), + + TP_printk("%s x %s -> %s %s", + __entry->state, + __entry->event, + __entry->next_state, + __entry->final_state ? 
"(final)" : "") +); + +DECLARE_EVENT_CLASS(error_da_monitor, + + TP_PROTO(char *state, char *event), + + TP_ARGS(state, event), + + TP_STRUCT__entry( + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + ), + + TP_printk("event %s not expected in the state %s", + __entry->event, + __entry->state) +); + +#include <monitors/wip/wip_trace.h> +// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here + +#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ + +#ifdef CONFIG_DA_MON_EVENTS_ID +DECLARE_EVENT_CLASS(event_da_monitor_id, + + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + + TP_ARGS(id, state, event, next_state, final_state), + + TP_STRUCT__entry( + __field( int, id ) + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + __array( char, next_state, MAX_DA_NAME_LEN ) + __field( bool, final_state ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); + __entry->id = id; + __entry->final_state = final_state; + ), + + TP_printk("%d: %s x %s -> %s %s", + __entry->id, + __entry->state, + __entry->event, + __entry->next_state, + __entry->final_state ? "(final)" : "") +); + +DECLARE_EVENT_CLASS(error_da_monitor_id, + + TP_PROTO(int id, char *state, char *event), + + TP_ARGS(id, state, event), + + TP_STRUCT__entry( + __field( int, id ) + __array( char, state, MAX_DA_NAME_LEN ) + __array( char, event, MAX_DA_NAME_LEN ) + ), + + TP_fast_assign( + memcpy(__entry->state, state, MAX_DA_NAME_LEN); + memcpy(__entry->event, event, MAX_DA_NAME_LEN); + __entry->id = id; + ), + + TP_printk("%d: event %s not expected in the state %s", + __entry->id, + __entry->event, + __entry->state) +); + +#include <monitors/wwnr/wwnr_trace.h> +// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here + +#endif /* CONFIG_DA_MON_EVENTS_ID */ +#endif /* _TRACE_RV_H */ + +/* This part ust be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE rv_trace +#include <trace/define_trace.h> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b6e40e8791fa..fd3cb2b2ab82 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -26,6 +26,7 @@ #include <linux/hardirq.h> #include <linux/linkage.h> #include <linux/uaccess.h> +#include <linux/cleanup.h> #include <linux/vmalloc.h> #include <linux/ftrace.h> #include <linux/module.h> @@ -535,19 +536,16 @@ LIST_HEAD(ftrace_trace_arrays); int trace_array_get(struct trace_array *this_tr) { struct trace_array *tr; - int ret = -ENODEV; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (tr == this_tr) { tr->ref++; - ret = 0; - break; + return 0; } } - mutex_unlock(&trace_types_lock); - return ret; + return -ENODEV; } static void __trace_array_put(struct trace_array *this_tr) @@ -1443,22 +1441,20 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) { - struct cond_snapshot *cond_snapshot; - int ret = 0; + struct cond_snapshot *cond_snapshot __free(kfree) = + kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); + int ret; - cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); if (!cond_snapshot) return -ENOMEM; cond_snapshot->cond_data = cond_data; cond_snapshot->update = update; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); - if (tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto fail_unlock; - } + if (tr->current_trace->use_max_tr) + return -EBUSY; /* * The cond_snapshot can only change to NULL without the @@ -1468,29 +1464,20 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, * do safely with only holding the trace_types_lock and not * having to take the max_lock. 
*/ - if (tr->cond_snapshot) { - ret = -EBUSY; - goto fail_unlock; - } + if (tr->cond_snapshot) + return -EBUSY; ret = tracing_arm_snapshot_locked(tr); if (ret) - goto fail_unlock; + return ret; local_irq_disable(); arch_spin_lock(&tr->max_lock); - tr->cond_snapshot = cond_snapshot; + tr->cond_snapshot = no_free_ptr(cond_snapshot); arch_spin_unlock(&tr->max_lock); local_irq_enable(); - mutex_unlock(&trace_types_lock); - - return ret; - - fail_unlock: - mutex_unlock(&trace_types_lock); - kfree(cond_snapshot); - return ret; + return 0; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); @@ -2203,10 +2190,10 @@ static __init int init_trace_selftests(void) selftests_can_run = true; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (list_empty(&postponed_selftests)) - goto out; + return 0; pr_info("Running postponed tracer tests:\n"); @@ -2235,9 +2222,6 @@ static __init int init_trace_selftests(void) } tracing_selftest_running = false; - out: - mutex_unlock(&trace_types_lock); - return 0; } core_initcall(init_trace_selftests); @@ -2807,7 +2791,7 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write, int save_tracepoint_printk; int ret; - mutex_lock(&tracepoint_printk_mutex); + guard(mutex)(&tracepoint_printk_mutex); save_tracepoint_printk = tracepoint_printk; ret = proc_dointvec(table, write, buffer, lenp, ppos); @@ -2820,16 +2804,13 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write, tracepoint_printk = 0; if (save_tracepoint_printk == tracepoint_printk) - goto out; + return ret; if (tracepoint_printk) static_key_enable(&tracepoint_printk_key.key); else static_key_disable(&tracepoint_printk_key.key); - out: - mutex_unlock(&tracepoint_printk_mutex); - return ret; } @@ -4119,12 +4100,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, - preempt_model_none() ? "server" : - preempt_model_voluntary() ? "desktop" : - preempt_model_full() ? "preempt" : - preempt_model_lazy() ? "lazy" : - preempt_model_rt() ? 
"preempt_rt" : - "unknown", + preempt_model_str(), /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP @@ -5127,7 +5103,8 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) u32 tracer_flags; int i; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); + tracer_flags = tr->current_trace->flags->val; trace_opts = tr->current_trace->flags->opts; @@ -5144,7 +5121,6 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) else seq_printf(m, "no%s\n", trace_opts[i].name); } - mutex_unlock(&trace_types_lock); return 0; } @@ -5537,6 +5513,8 @@ static const char readme_msg[] = "\t efield: For event probes ('e' types), the field is on of the fields\n" "\t of the <attached-group>/<attached-event>.\n" #endif + " set_event\t\t- Enables events by name written into it\n" + "\t\t\t Can enable module events via: :mod:<module>\n" " events/\t\t- Directory containing all trace event subsystems:\n" " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" " events/<system>/\t- Directory containing all trace events for <system>:\n" @@ -5809,7 +5787,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, return; } - mutex_lock(&trace_eval_mutex); + guard(mutex)(&trace_eval_mutex); if (!trace_eval_maps) trace_eval_maps = map_array; @@ -5833,8 +5811,6 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, map_array++; } memset(map_array, 0, sizeof(*map_array)); - - mutex_unlock(&trace_eval_mutex); } static void trace_create_eval_file(struct dentry *d_tracer) @@ -5996,26 +5972,15 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id) { - int ret; - - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (cpu_id != RING_BUFFER_ALL_CPUS) { /* make sure, this cpu is enabled in the mask */ - if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { - ret = -EINVAL; - goto out; - } + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) + return -EINVAL; } - ret = __tracing_resize_ring_buffer(tr, size, cpu_id); - if (ret < 0) - ret = -ENOMEM; - -out: - mutex_unlock(&trace_types_lock); - - return ret; + return __tracing_resize_ring_buffer(tr, size, cpu_id); } static void update_last_data(struct trace_array *tr) @@ -6106,9 +6071,9 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; #endif - int ret = 0; + int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); update_last_data(tr); @@ -6116,7 +6081,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) - goto out; + return ret; ret = 0; } @@ -6124,43 +6089,37 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (strcmp(t->name, buf) == 0) break; } - if (!t) { - ret = -EINVAL; - goto out; - } + if (!t) + return -EINVAL; + if (t == tr->current_trace) - goto out; + return 0; #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { local_irq_disable(); arch_spin_lock(&tr->max_lock); - if (tr->cond_snapshot) - ret = -EBUSY; + ret = tr->cond_snapshot ? 
-EBUSY : 0; arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) - goto out; + return ret; } #endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", t->name); - goto out; + return -EINVAL; } /* Some tracers are only allowed for the top level buffer */ - if (!trace_ok_for_array(t, tr)) { - ret = -EINVAL; - goto out; - } + if (!trace_ok_for_array(t, tr)) + return -EINVAL; /* If trace pipe files are being read, we can't change the tracer */ - if (tr->trace_ref) { - ret = -EBUSY; - goto out; - } + if (tr->trace_ref) + return -EBUSY; trace_branch_disable(); @@ -6191,7 +6150,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (!had_max_tr && t->use_max_tr) { ret = tracing_arm_snapshot_locked(tr); if (ret) - goto out; + return ret; } #else tr->current_trace = &nop_trace; @@ -6204,17 +6163,15 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t->use_max_tr) tracing_disarm_snapshot(tr); #endif - goto out; + return ret; } } tr->current_trace = t; tr->current_trace->enabled++; trace_branch_enable(tr); - out: - mutex_unlock(&trace_types_lock); - return ret; + return 0; } static ssize_t @@ -6292,22 +6249,18 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf, struct trace_array *tr = filp->private_data; int ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); if (ret < 0) - goto out; + return ret; if (tr->current_trace->update_thresh) { ret = tr->current_trace->update_thresh(tr); if (ret < 0) - goto out; + return ret; } - ret = cnt; -out: - mutex_unlock(&trace_types_lock); - - return ret; + return cnt; } #ifdef CONFIG_TRACER_MAX_TRACE @@ -6526,31 +6479,29 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, * This is just a matter of traces coherency, the ring buffer itself * is protected. 
*/ - mutex_lock(&iter->mutex); + guard(mutex)(&iter->mutex); /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) - goto out; + return sret; trace_seq_init(&iter->seq); if (iter->trace->read) { sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); if (sret) - goto out; + return sret; } waitagain: sret = tracing_wait_pipe(filp); if (sret <= 0) - goto out; + return sret; /* stop when tracing is finished */ - if (trace_empty(iter)) { - sret = 0; - goto out; - } + if (trace_empty(iter)) + return 0; if (cnt >= TRACE_SEQ_BUFFER_SIZE) cnt = TRACE_SEQ_BUFFER_SIZE - 1; @@ -6614,9 +6565,6 @@ waitagain: if (sret == -EBUSY) goto waitagain; -out: - mutex_unlock(&iter->mutex); - return sret; } @@ -7208,25 +7156,19 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve */ int tracing_set_filter_buffering(struct trace_array *tr, bool set) { - int ret = 0; - - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (set && tr->no_filter_buffering_ref++) - goto out; + return 0; if (!set) { - if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) + return -EINVAL; --tr->no_filter_buffering_ref; } - out: - mutex_unlock(&trace_types_lock); - return ret; + return 0; } struct ftrace_buffer_info { @@ -7302,12 +7244,10 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); - if (tr->current_trace->use_max_tr) { - ret = -EBUSY; - goto out; - } + if (tr->current_trace->use_max_tr) + return -EBUSY; local_irq_disable(); arch_spin_lock(&tr->max_lock); @@ -7316,24 +7256,20 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, arch_spin_unlock(&tr->max_lock); local_irq_enable(); if (ret) - goto out; + return ret; switch (val) { case 0: - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; if (tr->allocated_snapshot) free_snapshot(tr); break; case 1: /* Only allow per-cpu swap if the ring buffer supports it */ #ifndef CONFIG_RING_BUFFER_ALLOW_SWAP - if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { - ret = -EINVAL; - break; - } + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) + return -EINVAL; #endif if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, @@ -7341,7 +7277,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = tracing_arm_snapshot_locked(tr); if (ret) - break; + return ret; /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { @@ -7368,8 +7304,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; ret = cnt; } -out: - mutex_unlock(&trace_types_lock); + return ret; } @@ -7755,12 +7690,11 @@ void tracing_log_err(struct trace_array *tr, len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1; - mutex_lock(&tracing_err_log_lock); + guard(mutex)(&tracing_err_log_lock); + err = get_tracing_log_err(tr, len); - if (PTR_ERR(err) == -ENOMEM) { - mutex_unlock(&tracing_err_log_lock); + if (PTR_ERR(err) == -ENOMEM) return; - } snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd); @@ -7771,7 +7705,6 @@ void tracing_log_err(struct trace_array *tr, err->info.ts = local_clock(); list_add_tail(&err->list, &tr->err_log); - 
mutex_unlock(&tracing_err_log_lock); } static void clear_tracing_err_log(struct trace_array *tr) @@ -8341,6 +8274,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) struct trace_iterator *iter = &info->iter; int ret = 0; + /* Currently the boot mapped buffer is not supported for mmap */ + if (iter->tr->flags & TRACE_ARRAY_FL_BOOT) + return -ENODEV; + ret = get_snapshot_map(iter->tr); if (ret) return ret; @@ -9467,6 +9404,10 @@ trace_array_create_systems(const char *name, const char *systems, INIT_LIST_HEAD(&tr->hist_vars); INIT_LIST_HEAD(&tr->err_log); +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&tr->mod_events); +#endif + if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -9515,20 +9456,17 @@ static int instance_mkdir(const char *name) struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); ret = -EEXIST; if (trace_array_find(name)) - goto out_unlock; + return -EEXIST; tr = trace_array_create(name); ret = PTR_ERR_OR_ZERO(tr); -out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); return ret; } @@ -9578,24 +9516,23 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system { struct trace_array *tr; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; + if (tr->name && strcmp(tr->name, name) == 0) { + tr->ref++; + return tr; + } } tr = trace_array_create_systems(name, systems, 0, 0); if (IS_ERR(tr)) tr = NULL; -out_unlock: - if (tr) + else tr->ref++; - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); return tr; } EXPORT_SYMBOL_GPL(trace_array_get_by_name); @@ -9646,48 +9583,36 @@ static int __remove_instance(struct trace_array *tr) int trace_array_destroy(struct trace_array *this_tr) { struct trace_array *tr; - int ret; if (!this_tr) return -EINVAL; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = -ENODEV; /* Making sure trace array exists before destroying it. 
*/ list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr == this_tr) { - ret = __remove_instance(tr); - break; - } + if (tr == this_tr) + return __remove_instance(tr); } - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - - return ret; + return -ENODEV; } EXPORT_SYMBOL_GPL(trace_array_destroy); static int instance_rmdir(const char *name) { struct trace_array *tr; - int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); - ret = -ENODEV; tr = trace_array_find(name); - if (tr) - ret = __remove_instance(tr); - - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); + if (!tr) + return -ENODEV; - return ret; + return __remove_instance(tr); } static __init void create_trace_instances(struct dentry *d_tracer) @@ -9700,19 +9625,16 @@ static __init void create_trace_instances(struct dentry *d_tracer) if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n")) return; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&event_mutex); + guard(mutex)(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (!tr->name) continue; if (MEM_FAIL(trace_array_create_dir(tr) < 0, "Failed to create instance directory\n")) - break; + return; } - - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); } static void @@ -9902,6 +9824,24 @@ late_initcall_sync(trace_eval_sync); #ifdef CONFIG_MODULES + +bool module_exists(const char *module) +{ + /* All modules have the symbol __this_module */ + static const char this_mod[] = "__this_module"; + char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; + unsigned long val; + int n; + + n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod); + + if (n > sizeof(modname) - 1) + return false; + + val = module_kallsyms_lookup_name(modname); + return val != 0; +} + static void trace_module_add_evals(struct module *mod) { if (!mod->num_trace_evals) @@ -9926,7 +9866,7 @@ static void trace_module_remove_evals(struct module *mod) if (!mod->num_trace_evals) return; - mutex_lock(&trace_eval_mutex); + guard(mutex)(&trace_eval_mutex); map = trace_eval_maps; @@ -9938,12 +9878,10 @@ static void trace_module_remove_evals(struct module *mod) map = map->tail.next; } if (!map) - goto out; + return; *last = trace_eval_jmp_to_tail(map)->tail.next; kfree(map); - out: - mutex_unlock(&trace_eval_mutex); } #else static inline void trace_module_remove_evals(struct module *mod) { } @@ -10616,6 +10554,10 @@ __init static int tracer_alloc_buffers(void) #endif ftrace_init_global_array_ops(&global_trace); +#ifdef CONFIG_MODULES + INIT_LIST_HEAD(&global_trace.mod_events); +#endif + init_trace_flags_index(&global_trace); register_tracer(&nop_trace); @@ -10661,6 +10603,14 @@ out: return ret; } +#ifdef CONFIG_FUNCTION_TRACER +/* Used to set module cached ftrace filtering at boot up */ +__init struct trace_array *trace_get_global_array(void) +{ + return &global_trace; +} +#endif + void __init ftrace_boot_snapshot(void) { #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 179676db622e..9c21ba45b7af 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -400,6 +400,9 @@ struct trace_array { cpumask_var_t pipe_cpumask; int ref; int trace_ref; +#ifdef CONFIG_MODULES + struct list_head mod_events; +#endif #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; @@ -432,8 +435,18 @@ struct trace_array { enum { 
TRACE_ARRAY_FL_GLOBAL = BIT(0), TRACE_ARRAY_FL_BOOT = BIT(1), + TRACE_ARRAY_FL_MOD_INIT = BIT(2), }; +#ifdef CONFIG_MODULES +bool module_exists(const char *module); +#else +static inline bool module_exists(const char *module) +{ + return false; +} +#endif + extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; @@ -693,8 +706,10 @@ void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); -void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops); -int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops); +void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); +int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -909,7 +924,9 @@ extern int __trace_graph_retaddr_entry(struct trace_array *tr, unsigned long retaddr); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, - unsigned int trace_ctx); + unsigned int trace_ctx, + u64 calltime, u64 rettime); + extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops); extern void free_fgraph_ops(struct trace_array *tr); @@ -1112,6 +1129,7 @@ void ftrace_destroy_function_files(struct trace_array *tr); int ftrace_allocate_ftrace_ops(struct trace_array *tr); void ftrace_free_ftrace_ops(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); +struct trace_array *trace_get_global_array(void); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 4376887e0d8a..a322e4f249a5 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -74,24 +74,19 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type struct dyn_event *pos, *n; char *system = NULL, *event, *p; int argc, ret = -ENOENT; - char **argv; + char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc); - argv = argv_split(GFP_KERNEL, raw_command, &argc); if (!argv) return -ENOMEM; if (argv[0][0] == '-') { - if (argv[0][1] != ':') { - ret = -EINVAL; - goto out; - } + if (argv[0][1] != ':') + return -EINVAL; event = &argv[0][2]; } else { event = strchr(argv[0], ':'); - if (!event) { - ret = -EINVAL; - goto out; - } + if (!event) + return -EINVAL; event++; } @@ -101,10 +96,8 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (!system && event[0] == '\0') { - ret = -EINVAL; - goto out; - } + if (!system && event[0] == '\0') + return -EINVAL; mutex_lock(&event_mutex); for_each_dyn_event_safe(pos, n) { @@ -120,8 +113,6 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type } tracing_reset_all_online_cpus(); mutex_unlock(&event_mutex); -out: - argv_free(argv); return ret; } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 82fd174ebbe0..fbfb396905a6 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -124,8 +124,8 @@ 
FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_packed( unsigned long, ret, retval ) __field_packed( int, ret, depth ) __field_packed( unsigned int, ret, overrun ) - __field_packed( unsigned long long, ret, calltime) - __field_packed( unsigned long long, ret, rettime ) + __field(unsigned long long, calltime ) + __field(unsigned long long, rettime ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx", @@ -146,8 +146,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_packed( unsigned long, ret, func ) __field_packed( int, ret, depth ) __field_packed( unsigned int, ret, overrun ) - __field_packed( unsigned long long, ret, calltime) - __field_packed( unsigned long long, ret, rettime ) + __field(unsigned long long, calltime ) + __field(unsigned long long, rettime ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index be8be0c1aaf0..82fd637cfc19 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -917,10 +917,10 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } - mutex_lock(&event_mutex); - event_call = find_and_get_event(sys_name, sys_event); - ep = alloc_event_probe(group, event, event_call, argc - 2); - mutex_unlock(&event_mutex); + scoped_guard(mutex, &event_mutex) { + event_call = find_and_get_event(sys_name, sys_event); + ep = alloc_event_probe(group, event, event_call, argc - 2); + } if (IS_ERR(ep)) { ret = PTR_ERR(ep); @@ -952,23 +952,21 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (ret < 0) goto error; init_trace_eprobe_call(ep); - mutex_lock(&event_mutex); - ret = trace_probe_register_event_call(&ep->tp); - if (ret) { - if (ret == -EEXIST) { - trace_probe_log_set_index(0); - trace_probe_log_err(0, EVENT_EXIST); + scoped_guard(mutex, &event_mutex) { + ret = trace_probe_register_event_call(&ep->tp); + if (ret) { + if (ret == -EEXIST) { + trace_probe_log_set_index(0); + trace_probe_log_err(0, EVENT_EXIST); + } + goto error; + } + ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + if (ret < 0) { + trace_probe_unregister_event_call(&ep->tp); + goto error; } - mutex_unlock(&event_mutex); - goto error; - } - ret = dyn_event_add(&ep->devent, &ep->tp.event->call); - if (ret < 0) { - trace_probe_unregister_event_call(&ep->tp); - mutex_unlock(&event_mutex); - goto error; } - mutex_unlock(&event_mutex); return ret; parse_error: ret = -EINVAL; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 3ff9caa4a71b..a6bb7577e8c5 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event)) { - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; @@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event, * ...otherwise raw tracepoint data can be a severe data leak, * only allow root to have these. 
*/ - ret = perf_allow_tracepoint(&p_event->attr); + ret = perf_allow_tracepoint(); if (ret) return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 770e7ed91716..513de9ceb80e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -869,6 +869,120 @@ static int ftrace_event_enable_disable(struct trace_event_file *file, return __ftrace_event_enable_disable(file, enable, 0); } +#ifdef CONFIG_MODULES +struct event_mod_load { + struct list_head list; + char *module; + char *match; + char *system; + char *event; +}; + +static void free_event_mod(struct event_mod_load *event_mod) +{ + list_del(&event_mod->list); + kfree(event_mod->module); + kfree(event_mod->match); + kfree(event_mod->system); + kfree(event_mod->event); + kfree(event_mod); +} + +static void clear_mod_events(struct trace_array *tr) +{ + struct event_mod_load *event_mod, *n; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + free_event_mod(event_mod); + } +} + +static int remove_cache_mod(struct trace_array *tr, const char *mod, + const char *match, const char *system, const char *event) +{ + struct event_mod_load *event_mod, *n; + int ret = -EINVAL; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + if (strcmp(event_mod->module, mod) != 0) + continue; + + if (match && strcmp(event_mod->match, match) != 0) + continue; + + if (system && + (!event_mod->system || strcmp(event_mod->system, system) != 0)) + continue; + + if (event && + (!event_mod->event || strcmp(event_mod->event, event) != 0)) + continue; + + free_event_mod(event_mod); + ret = 0; + } + + return ret; +} + +static int cache_mod(struct trace_array *tr, const char *mod, int set, + const char *match, const char *system, const char *event) +{ + struct event_mod_load *event_mod; + + /* If the module exists, then this just failed to find an event */ + if (module_exists(mod)) + return -EINVAL; + + /* See if this is to remove a cached filter */ + if (!set) + return remove_cache_mod(tr, mod, match, system, event); + + event_mod = kzalloc(sizeof(*event_mod), GFP_KERNEL); + if (!event_mod) + return -ENOMEM; + + INIT_LIST_HEAD(&event_mod->list); + event_mod->module = kstrdup(mod, GFP_KERNEL); + if (!event_mod->module) + goto out_free; + + if (match) { + event_mod->match = kstrdup(match, GFP_KERNEL); + if (!event_mod->match) + goto out_free; + } + + if (system) { + event_mod->system = kstrdup(system, GFP_KERNEL); + if (!event_mod->system) + goto out_free; + } + + if (event) { + event_mod->event = kstrdup(event, GFP_KERNEL); + if (!event_mod->event) + goto out_free; + } + + list_add(&event_mod->list, &tr->mod_events); + + return 0; + + out_free: + free_event_mod(event_mod); + + return -ENOMEM; +} +#else /* CONFIG_MODULES */ +static inline void clear_mod_events(struct trace_array *tr) { } +static int cache_mod(struct trace_array *tr, const char *mod, int set, + const char *match, const char *system, const char *event) +{ + return -EINVAL; +} +#endif + static void ftrace_clear_events(struct trace_array *tr) { struct trace_event_file *file; @@ -877,6 +991,7 @@ static void ftrace_clear_events(struct trace_array *tr) list_for_each_entry(file, &tr->events, list) { ftrace_event_enable_disable(file, 0); } + clear_mod_events(tr); mutex_unlock(&event_mutex); } @@ -1165,17 +1280,36 @@ static void remove_event_file_dir(struct trace_event_file *file) */ static int __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) + const char *sub, 
const char *event, int set, + const char *mod) { struct trace_event_file *file; struct trace_event_call *call; + char *module __free(kfree) = NULL; const char *name; int ret = -EINVAL; int eret = 0; + if (mod) { + char *p; + + module = kstrdup(mod, GFP_KERNEL); + if (!module) + return -ENOMEM; + + /* Replace all '-' with '_' as that's what modules do */ + for (p = strchr(module, '-'); p; p = strchr(p + 1, '-')) + *p = '_'; + } + list_for_each_entry(file, &tr->events, list) { call = file->event_call; + + /* If a module is specified, skip events that are not that module */ + if (module && (!call->module || strcmp(module_name(call->module), module))) + continue; + name = trace_event_name(call); if (!name || !call->class || !call->class->reg) @@ -1208,16 +1342,24 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, ret = eret; } + /* + * If this is a module setting and nothing was found, + * check if the module was loaded. If it wasn't cache it. + */ + if (module && ret == -EINVAL && !eret) + ret = cache_mod(tr, module, set, match, sub, event); + return ret; } static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, - const char *sub, const char *event, int set) + const char *sub, const char *event, int set, + const char *mod) { int ret; mutex_lock(&event_mutex); - ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set); + ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod); mutex_unlock(&event_mutex); return ret; @@ -1225,11 +1367,20 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { - char *event = NULL, *sub = NULL, *match; + char *event = NULL, *sub = NULL, *match, *mod; int ret; if (!tr) return -ENOENT; + + /* Modules events can be appened with :mod:<module> */ + mod = strstr(buf, ":mod:"); + if (mod) { + *mod = '\0'; + /* move to the module name */ + mod += 5; + } + /* * The buf format can be <subsystem>:<event-name> * *:<event-name> means any event by that name. @@ -1252,9 +1403,13 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) sub = NULL; if (!strlen(event) || strcmp(event, "*") == 0) event = NULL; + } else if (mod) { + /* Allow wildcard for no length or star */ + if (!strlen(match) || strcmp(match, "*") == 0) + match = NULL; } - ret = __ftrace_set_clr_event(tr, match, sub, event, set); + ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod); /* Put back the colon to allow this to be called again */ if (buf) @@ -1282,7 +1437,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) if (!tr) return -ENODEV; - return __ftrace_set_clr_event(tr, NULL, system, event, set); + return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -1308,7 +1463,7 @@ int trace_array_set_clr_event(struct trace_array *tr, const char *system, return -ENOENT; set = (enable == true) ? 
1 : 0; - return __ftrace_set_clr_event(tr, NULL, system, event, set); + return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL); } EXPORT_SYMBOL_GPL(trace_array_set_clr_event); @@ -1395,37 +1550,78 @@ static void *t_start(struct seq_file *m, loff_t *pos) return file; } +enum set_event_iter_type { + SET_EVENT_FILE, + SET_EVENT_MOD, +}; + +struct set_event_iter { + enum set_event_iter_type type; + union { + struct trace_event_file *file; + struct event_mod_load *event_mod; + }; +}; + static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct trace_event_file *file = v; + struct set_event_iter *iter = v; + struct trace_event_file *file; struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(file, &tr->events, list) { - if (file->flags & EVENT_FILE_FL_ENABLED) - return file; + if (iter->type == SET_EVENT_FILE) { + file = iter->file; + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & EVENT_FILE_FL_ENABLED) { + iter->file = file; + return iter; + } + } +#ifdef CONFIG_MODULES + iter->type = SET_EVENT_MOD; + iter->event_mod = list_entry(&tr->mod_events, struct event_mod_load, list); +#endif } +#ifdef CONFIG_MODULES + list_for_each_entry_continue(iter->event_mod, &tr->mod_events, list) + return iter; +#endif + + /* + * The iter is allocated in s_start() and passed via the 'v' + * parameter. To stop the iterator, NULL must be returned. But + * the return value is what the 'v' parameter in s_stop() receives + * and frees. Free iter here as it will no longer be used. + */ + kfree(iter); return NULL; } static void *s_start(struct seq_file *m, loff_t *pos) { - struct trace_event_file *file; struct trace_array *tr = m->private; + struct set_event_iter *iter; loff_t l; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + mutex_lock(&event_mutex); - file = list_entry(&tr->events, struct trace_event_file, list); + iter->type = SET_EVENT_FILE; + iter->file = list_entry(&tr->events, struct trace_event_file, list); + for (l = 0; l <= *pos; ) { - file = s_next(m, file, &l); - if (!file) + iter = s_next(m, iter, &l); + if (!iter) break; } - return file; + return iter; } static int t_show(struct seq_file *m, void *v) @@ -1445,6 +1641,45 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +#ifdef CONFIG_MODULES +static int s_show(struct seq_file *m, void *v) +{ + struct set_event_iter *iter = v; + const char *system; + const char *event; + + if (iter->type == SET_EVENT_FILE) + return t_show(m, iter->file); + + /* When match is set, system and event are not */ + if (iter->event_mod->match) { + seq_printf(m, "%s:mod:%s\n", iter->event_mod->match, + iter->event_mod->module); + return 0; + } + + system = iter->event_mod->system ? : "*"; + event = iter->event_mod->event ? 
: "*"; + + seq_printf(m, "%s:%s:mod:%s\n", system, event, iter->event_mod->module); + + return 0; +} +#else /* CONFIG_MODULES */ +static int s_show(struct seq_file *m, void *v) +{ + struct set_event_iter *iter = v; + + return t_show(m, iter->file); +} +#endif + +static void s_stop(struct seq_file *m, void *v) +{ + kfree(v); + t_stop(m, NULL); +} + static void * __next(struct seq_file *m, void *v, loff_t *pos, int type) { @@ -1558,21 +1793,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; + guard(mutex)(&event_mutex); + switch (val) { case 0: case 1: - ret = -ENODEV; - mutex_lock(&event_mutex); file = event_file_file(filp); - if (likely(file)) { - ret = tracing_update_buffers(file->tr); - if (ret < 0) { - mutex_unlock(&event_mutex); - return ret; - } - ret = ftrace_event_enable_disable(file, val); - } - mutex_unlock(&event_mutex); + if (!file) + return -ENODEV; + ret = tracing_update_buffers(file->tr); + if (ret < 0) + return ret; + ret = ftrace_event_enable_disable(file, val); + if (ret < 0) + return ret; break; default: @@ -1581,7 +1815,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, *ppos += cnt; - return ret ? ret : cnt; + return cnt; } static ssize_t @@ -1659,7 +1893,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (system) name = system->name; - ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val, NULL); if (ret) goto out; @@ -2157,7 +2391,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (ret < 0) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); if (type == TRACE_PIDS) { filtered_pids = rcu_dereference_protected(tr->filtered_pids, @@ -2173,7 +2407,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) - goto out; + return ret; if (type == TRACE_PIDS) rcu_assign_pointer(tr->filtered_pids, pid_list); @@ -2198,11 +2432,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, */ on_each_cpu(ignore_task_cpu, tr, 1); - out: - mutex_unlock(&event_mutex); - - if (ret > 0) - *ppos += ret; + *ppos += ret; return ret; } @@ -2237,8 +2467,8 @@ static const struct seq_operations show_event_seq_ops = { static const struct seq_operations show_set_event_seq_ops = { .start = s_start, .next = s_next, - .show = t_show, - .stop = t_stop, + .show = s_show, + .stop = s_stop, }; static const struct seq_operations show_set_pid_seq_ops = { @@ -3111,6 +3341,20 @@ static bool event_in_systems(struct trace_event_call *call, return !*p || isspace(*p) || *p == ','; } +#ifdef CONFIG_HIST_TRIGGERS +/* + * Wake up waiter on the hist_poll_wq from irq_work because the hist trigger + * may happen in any context. 
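The wait queue and irq_work declared in this hunk pair with the new .poll handler added to the events/*/hist file further down in the patch: hist trigger hits wake hist_poll_wq from irq_work context, and event_hist_poll() reports EPOLLIN/EPOLLRDNORM when the histogram has hits not yet read and EPOLLPRI when it has hits newer than the last access. From userspace that means poll(2) on a hist file can replace re-reading it in a loop. A minimal sketch, assuming tracefs is mounted at /sys/kernel/tracing and a hist trigger is already set on the (illustrative) sched_switch event:

/* Block until the sched_switch histogram records new hits, then dump it. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/tracing/events/sched/sched_switch/hist",
		      O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
	char buf[4096];
	ssize_t n;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Returns once the trigger has recorded a hit since open()/last read. */
	if (poll(&pfd, 1, -1) < 0) {
		perror("poll");
		return 1;
	}

	/* Reading the seq_file also refreshes the poll bookkeeping. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(fd);
	return 0;
}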
+ */ +static void hist_poll_event_irq_work(struct irq_work *work) +{ + wake_up_all(&hist_poll_wq); +} + +DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work); +DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq); +#endif + static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) @@ -3269,13 +3513,13 @@ int trace_add_event_call(struct trace_event_call *call) int ret; lockdep_assert_held(&event_mutex); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = __register_event(call, NULL); - if (ret >= 0) - __add_event_to_tracers(call); + if (ret < 0) + return ret; - mutex_unlock(&trace_types_lock); + __add_event_to_tracers(call); return ret; } EXPORT_SYMBOL_GPL(trace_add_event_call); @@ -3355,6 +3599,28 @@ EXPORT_SYMBOL_GPL(trace_remove_event_call); event++) #ifdef CONFIG_MODULES +static void update_mod_cache(struct trace_array *tr, struct module *mod) +{ + struct event_mod_load *event_mod, *n; + + list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) { + if (strcmp(event_mod->module, mod->name) != 0) + continue; + + __ftrace_set_clr_event_nolock(tr, event_mod->match, + event_mod->system, + event_mod->event, 1, mod->name); + free_event_mod(event_mod); + } +} + +static void update_cache_events(struct module *mod) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) + update_mod_cache(tr, mod); +} static void trace_module_add_events(struct module *mod) { @@ -3377,6 +3643,8 @@ static void trace_module_add_events(struct module *mod) __register_event(*call, mod); __add_event_to_tracers(*call); } + + update_cache_events(mod); } static void trace_module_remove_events(struct module *mod) @@ -3529,30 +3797,21 @@ struct trace_event_file *trace_get_event_file(const char *instance, return ERR_PTR(ret); } - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); file = find_event_file(tr, system, event); if (!file) { trace_array_put(tr); - ret = -EINVAL; - goto out; + return ERR_PTR(-EINVAL); } /* Don't let event modules unload while in use */ ret = trace_event_try_get_ref(file->event_call); if (!ret) { trace_array_put(tr); - ret = -EBUSY; - goto out; + return ERR_PTR(-EBUSY); } - ret = 0; - out: - mutex_unlock(&event_mutex); - - if (ret) - file = ERR_PTR(ret); - return file; } EXPORT_SYMBOL_GPL(trace_get_event_file); @@ -3770,6 +4029,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; + unsigned long count = -1; const char *system; const char *event; char *number; @@ -3789,12 +4049,11 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, event = strsep(¶m, ":"); - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - ret = -EINVAL; file = find_event_file(tr, system, event); if (!file) - goto out; + return -EINVAL; enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; @@ -3803,74 +4062,62 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, else ops = param ? 
&event_disable_count_probe_ops : &event_disable_probe_ops; - if (glob[0] == '!') { - ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); - goto out; - } - - ret = -ENOMEM; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - goto out; - - data->enable = enable; - data->count = -1; - data->file = file; - - if (!param) - goto out_reg; + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); - number = strsep(¶m, ":"); + if (param) { + number = strsep(¶m, ":"); - ret = -EINVAL; - if (!strlen(number)) - goto out_free; + if (!strlen(number)) + return -EINVAL; - /* - * We use the callback data field (which is a pointer) - * as our counter. - */ - ret = kstrtoul(number, 0, &data->count); - if (ret) - goto out_free; + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &count); + if (ret) + return ret; + } - out_reg: /* Don't let event modules unload while probe registered */ ret = trace_event_try_get_ref(file->event_call); - if (!ret) { - ret = -EBUSY; - goto out_free; - } + if (!ret) + return -EBUSY; ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_put; + + data->enable = enable; + data->count = count; + data->file = file; + ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ - if (!ret) { - ret = -ENOENT; - goto out_disable; - } else if (ret < 0) - goto out_disable; + /* Just return zero, not the number of enabled functions */ - ret = 0; - out: - mutex_unlock(&event_mutex); - return ret; + if (ret > 0) + return 0; + + kfree(data); + + if (!ret) + ret = -ENOENT; - out_disable: __ftrace_event_enable_disable(file, 0, 1); out_put: trace_event_put_ref(file->event_call); - out_free: - kfree(data); - goto out; + return ret; } static struct ftrace_func_command event_enable_cmd = { @@ -4093,20 +4340,17 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) - goto out_unlock; + return ret; down_write(&trace_event_sem); __trace_early_add_event_dirs(tr); up_write(&trace_event_sem); - out_unlock: - mutex_unlock(&event_mutex); - - return ret; + return 0; } /* Must be called with event_mutex held */ @@ -4121,7 +4365,7 @@ int event_trace_del_tracer(struct trace_array *tr) __ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS); /* Disable any running events */ - __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); + __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0, NULL); /* Make sure no more events are being executed */ tracepoint_synchronize_unregister(); @@ -4405,7 +4649,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling system %s\n", system->name); @@ -4414,7 +4658,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling system %s\n", 
system->name); @@ -4429,7 +4673,7 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error enabling all events\n"); return; @@ -4438,7 +4682,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0, NULL); if (WARN_ON_ONCE(ret)) { pr_warn("error disabling all events\n"); return; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 78051de581e7..0993dfc1c5c1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2405,13 +2405,11 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, struct event_filter *filter = NULL; int err = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); /* Make sure the system still has events */ - if (!dir->nr_events) { - err = -ENODEV; - goto out_unlock; - } + if (!dir->nr_events) + return -ENODEV; if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(dir, tr); @@ -2422,7 +2420,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, tracepoint_synchronize_unregister(); filter_free_subsystem_filters(dir, tr); __free_filter(filter); - goto out_unlock; + return 0; } err = create_system_filter(dir, filter_string, &filter); @@ -2434,8 +2432,6 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, __free_filter(system->filter); system->filter = filter; } -out_unlock: - mutex_unlock(&event_mutex); return err; } @@ -2612,17 +2608,15 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, struct event_filter *filter = NULL; struct trace_event_call *call; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); call = event->tp_event; - err = -EINVAL; if (!call) - goto out_unlock; + return -EINVAL; - err = -EEXIST; if (event->filter) - goto out_unlock; + return -EEXIST; err = create_filter(NULL, call, filter_str, false, &filter); if (err) @@ -2637,9 +2631,6 @@ free_filter: if (err || ftrace_event_is_function(call)) __free_filter(filter); -out_unlock: - mutex_unlock(&event_mutex); - return err; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9c058aa8baf3..53dc6719181e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5311,6 +5311,8 @@ static void event_hist_trigger(struct event_trigger_data *data, if (resolve_var_refs(hist_data, key, var_ref_vals, true)) hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); + + hist_poll_wakeup(); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5590,49 +5592,137 @@ static void hist_trigger_show(struct seq_file *m, n_entries, (u64)atomic64_read(&hist_data->map->drops)); } +struct hist_file_data { + struct file *file; + u64 last_read; + u64 last_act; +}; + +static u64 get_hist_hit_count(struct trace_event_file *event_file) +{ + struct hist_trigger_data *hist_data; + struct event_trigger_data *data; + u64 ret = 0; + + list_for_each_entry(data, &event_file->triggers, list) { + if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) { + hist_data = data->private_data; + ret += atomic64_read(&hist_data->map->hits); + } + } + return ret; +} + static int hist_show(struct seq_file *m, void *v) { + 
struct hist_file_data *hist_file = m->private; struct event_trigger_data *data; struct trace_event_file *event_file; - int n = 0, ret = 0; + int n = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - event_file = event_file_file(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } + event_file = event_file_file(hist_file->file); + if (unlikely(!event_file)) + return -ENODEV; list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } + hist_file->last_read = get_hist_hit_count(event_file); + /* + * Update last_act too so that poll()/POLLPRI can wait for the next + * event after any syscall on hist file. + */ + hist_file->last_act = hist_file->last_read; - out_unlock: - mutex_unlock(&event_mutex); + return 0; +} + +static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait) +{ + struct trace_event_file *event_file; + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + __poll_t ret = 0; + u64 cnt; + + guard(mutex)(&event_mutex); + + event_file = event_file_data(file); + if (!event_file) + return EPOLLERR; + + hist_poll_wait(file, wait); + + cnt = get_hist_hit_count(event_file); + if (hist_file->last_read != cnt) + ret |= EPOLLIN | EPOLLRDNORM; + if (hist_file->last_act != cnt) { + hist_file->last_act = cnt; + ret |= EPOLLPRI; + } return ret; } +static int event_hist_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct hist_file_data *hist_file = m->private; + + kfree(hist_file); + return tracing_single_release_file_tr(inode, file); +} + static int event_hist_open(struct inode *inode, struct file *file) { + struct trace_event_file *event_file; + struct hist_file_data *hist_file; int ret; ret = tracing_open_file_tr(inode, file); if (ret) return ret; + guard(mutex)(&event_mutex); + + event_file = event_file_data(file); + if (!event_file) { + ret = -ENODEV; + goto err; + } + + hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL); + if (!hist_file) { + ret = -ENOMEM; + goto err; + } + + hist_file->file = file; + hist_file->last_act = get_hist_hit_count(event_file); + /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; - return single_open(file, hist_show, file); + ret = single_open(file, hist_show, hist_file); + if (ret) { + kfree(hist_file); + goto err; + } + + return 0; +err: + tracing_release_file_tr(inode, file); + return ret; } const struct file_operations event_hist_fops = { .open = event_hist_open, .read = seq_read, .llseek = seq_lseek, - .release = tracing_single_release_file_tr, + .release = event_hist_release, + .poll = event_hist_poll, }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG @@ -5873,25 +5963,19 @@ static int hist_debug_show(struct seq_file *m, void *v) { struct event_trigger_data *data; struct trace_event_file *event_file; - int n = 0, ret = 0; + int n = 0; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); event_file = event_file_file(m->private); - if (unlikely(!event_file)) { - ret = -ENODEV; - goto out_unlock; - } + if (unlikely(!event_file)) + return -ENODEV; list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_debug_show(m, data, n++); } - - out_unlock: - mutex_unlock(&event_mutex); - - return ret; + return 0; } static int event_hist_debug_open(struct inode *inode, struct file *file) @@ -5904,7 +5988,10 @@ static int 
event_hist_debug_open(struct inode *inode, struct file *file) /* Clear private_data to avoid warning in single_open() */ file->private_data = NULL; - return single_open(file, hist_debug_show, file); + ret = single_open(file, hist_debug_show, file); + if (ret) + tracing_release_file_tr(inode, file); + return ret; } const struct file_operations event_hist_debug_fops = { @@ -6649,27 +6736,27 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, if (existing_hist_update_only(glob, trigger_data, file)) goto out_free; - ret = event_trigger_register(cmd_ops, file, glob, trigger_data); - if (ret < 0) - goto out_free; + if (!get_named_trigger_data(trigger_data)) { - if (get_named_trigger_data(trigger_data)) - goto enable; + ret = create_actions(hist_data); + if (ret) + goto out_free; - ret = create_actions(hist_data); - if (ret) - goto out_unreg; + if (has_hist_vars(hist_data) || hist_data->n_var_refs) { + ret = save_hist_vars(hist_data); + if (ret) + goto out_free; + } - if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - ret = save_hist_vars(hist_data); + ret = tracing_map_init(hist_data->map); if (ret) - goto out_unreg; + goto out_free; } - ret = tracing_map_init(hist_data->map); - if (ret) - goto out_unreg; -enable: + ret = event_trigger_register(cmd_ops, file, glob, trigger_data); + if (ret < 0) + goto out_free; + ret = hist_trigger_enable(trigger_data, file); if (ret) goto out_unreg; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index c82b401a294d..e3f7d09e5512 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -49,16 +49,11 @@ static char *last_cmd; static int errpos(const char *str) { - int ret = 0; - - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!str || !last_cmd) - goto out; + return 0; - ret = err_pos(last_cmd, str); - out: - mutex_unlock(&lastcmd_mutex); - return ret; + return err_pos(last_cmd, str); } static void last_cmd_set(const char *str) @@ -74,14 +69,12 @@ static void last_cmd_set(const char *str) static void synth_err(u8 err_type, u16 err_pos) { - mutex_lock(&lastcmd_mutex); + guard(mutex)(&lastcmd_mutex); if (!last_cmd) - goto out; + return; tracing_log_err(NULL, "synthetic_events", last_cmd, err_text, err_type, err_pos); - out: - mutex_unlock(&lastcmd_mutex); } static int create_synth_event(const char *raw_command); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index a5e3d6acf1e1..d45448947094 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -211,12 +211,10 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) if (ret) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); - if (unlikely(!event_file_file(file))) { - mutex_unlock(&event_mutex); + if (unlikely(!event_file_file(file))) return -ENODEV; - } if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { @@ -239,8 +237,6 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) } } - mutex_unlock(&event_mutex); - return ret; } @@ -248,7 +244,6 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) { char *command, *next; struct event_command *p; - int ret = -EINVAL; next = buff = skip_spaces(buff); command = strsep(&next, ": \t"); @@ -259,17 +254,14 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) } command = (command[0] != '!') ? 
command : command + 1; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->parse(p, file, buff, command, next); - goto out_unlock; - } + if (strcmp(p->name, command) == 0) + return p->parse(p, file, buff, command, next); } - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return -EINVAL; } static ssize_t event_trigger_regex_write(struct file *file, @@ -278,7 +270,7 @@ static ssize_t event_trigger_regex_write(struct file *file, { struct trace_event_file *event_file; ssize_t ret; - char *buf; + char *buf __free(kfree) = NULL; if (!cnt) return 0; @@ -292,24 +284,18 @@ static ssize_t event_trigger_regex_write(struct file *file, strim(buf); - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); + event_file = event_file_file(file); - if (unlikely(!event_file)) { - mutex_unlock(&event_mutex); - kfree(buf); + if (unlikely(!event_file)) return -ENODEV; - } - ret = trigger_process_regex(event_file, buf); - mutex_unlock(&event_mutex); - kfree(buf); + ret = trigger_process_regex(event_file, buf); if (ret < 0) - goto out; + return ret; *ppos += cnt; - ret = cnt; - out: - return ret; + return cnt; } static int event_trigger_regex_release(struct inode *inode, struct file *file) @@ -359,20 +345,16 @@ const struct file_operations event_trigger_fops = { __init int register_event_command(struct event_command *cmd) { struct event_command *p; - int ret = 0; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } + if (strcmp(cmd->name, p->name) == 0) + return -EBUSY; } list_add(&cmd->list, &trigger_commands); - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return 0; } /* @@ -382,20 +364,17 @@ __init int register_event_command(struct event_command *cmd) __init int unregister_event_command(struct event_command *cmd) { struct event_command *p, *n; - int ret = -ENODEV; - mutex_lock(&trigger_cmd_mutex); + guard(mutex)(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { if (strcmp(cmd->name, p->name) == 0) { - ret = 0; list_del_init(&p->list); - goto out_unlock; + return 0; } } - out_unlock: - mutex_unlock(&trigger_cmd_mutex); - return ret; + return -ENODEV; } /** diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 17bcad8f79de..97325fbd6283 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -2899,7 +2899,7 @@ static int set_max_user_events_sysctl(const struct ctl_table *table, int write, return ret; } -static struct ctl_table user_event_sysctls[] = { +static const struct ctl_table user_event_sysctls[] = { { .procname = "user_events_max", .data = &max_user_events, diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index c62d1629cffe..985ff98272da 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -134,7 +134,7 @@ static int process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, void *dest, void *base) { - struct pt_regs *regs = rec; + struct ftrace_regs *fregs = rec; unsigned long val; int ret; @@ -142,17 +142,17 @@ retry: /* 1st stage: get value from context */ switch (code->op) { case FETCH_OP_STACK: - val = regs_get_kernel_stack_nth(regs, code->param); + val = ftrace_regs_get_kernel_stack_nth(fregs, code->param); break; case FETCH_OP_STACKP: - val = 
kernel_stack_pointer(regs); + val = ftrace_regs_get_stack_pointer(fregs); break; case FETCH_OP_RETVAL: - val = regs_return_value(regs); + val = ftrace_regs_get_return_value(fregs); break; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API case FETCH_OP_ARG: - val = regs_get_kernel_argument(regs, code->param); + val = ftrace_regs_get_argument(fregs, code->param); break; case FETCH_OP_EDATA: val = *(unsigned long *)((unsigned long)edata + code->offset); @@ -175,7 +175,7 @@ NOKPROBE_SYMBOL(process_fetch_insn) /* function entry handler */ static nokprobe_inline void __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs, + struct ftrace_regs *fregs, struct trace_event_file *trace_file) { struct fentry_trace_entry_head *entry; @@ -189,41 +189,71 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs, NULL); + dsize = __get_data_size(&tf->tp, fregs, NULL); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); if (!entry) return; - fbuffer.regs = regs; + fbuffer.regs = ftrace_get_regs(fregs); entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = entry_ip; - store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } static void fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs) + struct ftrace_regs *fregs) { struct event_file_link *link; trace_probe_for_each_link_rcu(link, &tf->tp) - __fentry_trace_func(tf, entry_ip, regs, link->file); + __fentry_trace_func(tf, entry_ip, fregs, link->file); } NOKPROBE_SYMBOL(fentry_trace_func); +static nokprobe_inline +void store_fprobe_entry_data(void *edata, struct trace_probe *tp, struct ftrace_regs *fregs) +{ + struct probe_entry_arg *earg = tp->entry_arg; + unsigned long val = 0; + int i; + + if (!earg) + return; + + for (i = 0; i < earg->size; i++) { + struct fetch_insn *code = &earg->code[i]; + + switch (code->op) { + case FETCH_OP_ARG: + val = ftrace_regs_get_argument(fregs, code->param); + break; + case FETCH_OP_ST_EDATA: + *(unsigned long *)((unsigned long)edata + code->offset) = val; + break; + case FETCH_OP_END: + goto end; + default: + break; + } + } +end: + return; +} + /* function exit handler */ static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); if (tf->tp.entry_arg) - store_trace_entry_data(entry_data, &tf->tp, regs); + store_fprobe_entry_data(entry_data, &tf->tp, fregs); return 0; } @@ -231,7 +261,7 @@ NOKPROBE_SYMBOL(trace_fprobe_entry_handler) static nokprobe_inline void __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data, struct trace_event_file *trace_file) { struct fexit_trace_entry_head *entry; @@ -245,60 +275,63 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs, entry_data); + dsize = __get_data_size(&tf->tp, fregs, entry_data); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); 
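/*
 * Editor's aside (illustrative sketch, not part of the patch): the
 * trace_fprobe.c hunks around this point switch the fprobe handlers from
 * struct pt_regs to struct ftrace_regs and fetch state through the
 * ftrace_regs_*() accessors used above (ftrace_regs_get_argument() is only
 * available on architectures with HAVE_FUNCTION_ARG_ACCESS_API, as the
 * fetch-insn hunk notes).  A minimal entry handler in the new style; the
 * handler name and debug message are hypothetical:
 */
static int sample_fentry(struct fprobe *fp, unsigned long entry_ip,
			 unsigned long ret_ip, struct ftrace_regs *fregs,
			 void *entry_data)
{
	/* First function argument, read via ftrace_regs instead of pt_regs. */
	unsigned long arg0 = ftrace_regs_get_argument(fregs, 0);

	pr_debug("%ps(0x%lx)\n", (void *)entry_ip, arg0);
	return 0;
}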
if (!entry) return; - fbuffer.regs = regs; + fbuffer.regs = ftrace_get_regs(fregs); entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } static void fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, void *entry_data) + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct event_file_link *link; trace_probe_for_each_link_rcu(link, &tf->tp) - __fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data, link->file); + __fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data, link->file); } NOKPROBE_SYMBOL(fexit_trace_func); #ifdef CONFIG_PERF_EVENTS static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs) + struct ftrace_regs *fregs) { struct trace_event_call *call = trace_probe_event_call(&tf->tp); struct fentry_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; + struct pt_regs *regs; int rctx; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return 0; - dsize = __get_data_size(&tf->tp, regs, NULL); + dsize = __get_data_size(&tf->tp, fregs, NULL); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_alloc(size, NULL, &rctx); + entry = perf_trace_buf_alloc(size, ®s, &rctx); if (!entry) return 0; + regs = ftrace_fill_perf_regs(fregs, regs); + entry->ip = entry_ip; memset(&entry[1], 0, dsize); - store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -307,31 +340,34 @@ NOKPROBE_SYMBOL(fentry_perf_func); static void fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_event_call *call = trace_probe_event_call(&tf->tp); struct fexit_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; + struct pt_regs *regs; int rctx; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; - dsize = __get_data_size(&tf->tp, regs, entry_data); + dsize = __get_data_size(&tf->tp, fregs, entry_data); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_alloc(size, NULL, &rctx); + entry = perf_trace_buf_alloc(size, ®s, &rctx); if (!entry) return; + regs = ftrace_fill_perf_regs(fregs, regs); + entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } @@ -339,33 +375,34 @@ NOKPROBE_SYMBOL(fexit_perf_func); #endif /* CONFIG_PERF_EVENTS */ static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); int ret = 0; if 
(trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) - fentry_trace_func(tf, entry_ip, regs); + fentry_trace_func(tf, entry_ip, fregs); + #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) - ret = fentry_perf_func(tf, entry_ip, regs); + ret = fentry_perf_func(tf, entry_ip, fregs); #endif return ret; } NOKPROBE_SYMBOL(fentry_dispatcher); static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) - fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data); + fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) - fexit_perf_func(tf, entry_ip, ret_ip, regs, entry_data); + fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data); #endif } NOKPROBE_SYMBOL(fexit_dispatcher); @@ -379,6 +416,9 @@ static void free_trace_fprobe(struct trace_fprobe *tf) } } +/* Since alloc_trace_fprobe() can return error, check the pointer is ERR too. */ +DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) free_trace_fprobe(_T)) + /* * Allocate new trace_probe and initialize it (including fprobe). */ @@ -387,10 +427,9 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *symbol, struct tracepoint *tpoint, struct module *mod, - int maxactive, int nargs, bool is_return) { - struct trace_fprobe *tf; + struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; int ret = -ENOMEM; tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL); @@ -399,7 +438,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->symbol = kstrdup(symbol, GFP_KERNEL); if (!tf->symbol) - goto error; + return ERR_PTR(-ENOMEM); if (is_return) tf->fp.exit_handler = fexit_dispatcher; @@ -408,17 +447,13 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->tpoint = tpoint; tf->mod = mod; - tf->fp.nr_maxactive = maxactive; ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&tf->devent, &trace_fprobe_ops); - return tf; -error: - free_trace_fprobe(tf); - return ERR_PTR(ret); + return_ptr(tf); } static struct trace_fprobe *find_trace_fprobe(const char *event, @@ -845,14 +880,12 @@ static int register_trace_fprobe(struct trace_fprobe *tf) struct trace_fprobe *old_tf; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), trace_probe_group_name(&tf->tp)); - if (old_tf) { - ret = append_trace_fprobe(tf, old_tf); - goto end; - } + if (old_tf) + return append_trace_fprobe(tf, old_tf); /* Register new event */ ret = register_fprobe_event(tf); @@ -862,7 +895,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } /* Register fprobe */ @@ -872,8 +905,6 @@ static int register_trace_fprobe(struct trace_fprobe *tf) else dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); -end: - mutex_unlock(&event_mutex); return ret; } @@ -889,13 +920,8 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo if (!data->tpoint && !strcmp(data->tp_name, tp->name)) { data->tpoint = tp; - if (!data->mod) { + if (!data->mod) data->mod = mod; - 
if (!try_module_get(data->mod)) { - data->tpoint = NULL; - data->mod = NULL; - } - } } } @@ -907,13 +933,7 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv) data->tpoint = tp; } -/* - * Find a tracepoint from kernel and module. If the tracepoint is in a module, - * this increments the module refcount to prevent unloading until the - * trace_fprobe is registered to the list. After registering the trace_fprobe - * on the trace_fprobe list, the module refcount is decremented because - * tracepoint_probe_module_cb will handle it. - */ +/* Find a tracepoint from kernel and module. */ static struct tracepoint *find_tracepoint(const char *tp_name, struct module **tp_mod) { @@ -942,6 +962,7 @@ static void reenable_trace_fprobe(struct trace_fprobe *tf) } } +/* Find a tracepoint from specified module. */ static struct tracepoint *find_tracepoint_in_module(struct module *mod, const char *tp_name) { @@ -977,10 +998,13 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, reenable_trace_fprobe(tf); } } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { - tracepoint_probe_unregister(tf->tpoint, + unregister_fprobe(&tf->fp); + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_probe_unregister(tf->tpoint, tf->tpoint->probestub, NULL); - tf->tpoint = NULL; - tf->mod = NULL; + tf->tpoint = TRACEPOINT_STUB; + tf->mod = NULL; + } } } mutex_unlock(&event_mutex); @@ -1018,6 +1042,19 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (*is_return) return 0; + if (is_tracepoint) { + tmp = *symbol; + while (*tmp && (isalnum(*tmp) || *tmp == '_')) + tmp++; + if (*tmp) { + /* find a wrong character. */ + trace_probe_log_err(tmp - *symbol, BAD_TP_NAME); + kfree(*symbol); + *symbol = NULL; + return -EINVAL; + } + } + /* If there is $retval, this should be a return fprobe. */ for (i = 2; i < argc; i++) { tmp = strstr(argv[i], "$retval"); @@ -1025,6 +1062,8 @@ static int parse_symbol_and_return(int argc, const char *argv[], if (is_tracepoint) { trace_probe_log_set_index(i); trace_probe_log_err(tmp - argv[i], RETVAL_ON_PROBE); + kfree(*symbol); + *symbol = NULL; return -EINVAL; } *is_return = true; @@ -1034,7 +1073,10 @@ static int parse_symbol_and_return(int argc, const char *argv[], return 0; } -static int __trace_fprobe_create(int argc, const char *argv[]) +DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T)) + +static int trace_fprobe_create_internal(int argc, const char *argv[], + struct traceprobe_parse_context *ctx) { /* * Argument syntax: @@ -1060,24 +1102,20 @@ static int __trace_fprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ - struct trace_fprobe *tf = NULL; - int i, len, new_argc = 0, ret = 0; + struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; + int i, new_argc = 0, ret = 0; bool is_return = false; - char *symbol = NULL; + char *symbol __free(kfree) = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; - const char **new_argv = NULL; - int maxactive = 0; + const char **new_argv __free(kfree) = NULL; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; - char *dbuf = NULL; + char *dbuf __free(kfree) = NULL; bool is_tracepoint = false; - struct module *tp_mod = NULL; + struct module *tp_mod __free(module_put) = NULL; struct tracepoint *tpoint = NULL; - struct traceprobe_parse_context ctx = { - .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, - }; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -1087,35 +1125,13 @@ static int __trace_fprobe_create(int argc, const char *argv[]) group = TRACEPOINT_EVENT_SYSTEM; } - trace_probe_log_init("trace_fprobe", argc, argv); - - event = strchr(&argv[0][1], ':'); - if (event) - event++; - - if (isdigit(argv[0][1])) { - if (event) - len = event - &argv[0][1] - 1; - else - len = strlen(&argv[0][1]); - if (len > MAX_EVENT_NAME_LEN - 1) { - trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; - } - memcpy(buf, &argv[0][1], len); - buf[len] = '\0'; - ret = kstrtouint(buf, 0, &maxactive); - if (ret || !maxactive) { + if (argv[0][1] != '\0') { + if (argv[0][1] != ':') { + trace_probe_log_set_index(0); trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; - } - /* fprobe rethook instances are iterated over via a list. The - * maximum should stay reasonable. - */ - if (maxactive > RETHOOK_MAXACTIVE_MAX) { - trace_probe_log_err(1, MAXACT_TOO_BIG); - goto parse_error; + return -EINVAL; } + event = &argv[0][2]; } trace_probe_log_set_index(1); @@ -1123,20 +1139,14 @@ static int __trace_fprobe_create(int argc, const char *argv[]) /* a symbol(or tracepoint) must be specified */ ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint); if (ret < 0) - goto parse_error; - - if (!is_return && maxactive) { - trace_probe_log_set_index(0); - trace_probe_log_err(1, BAD_MAXACT_TYPE); - goto parse_error; - } + return -EINVAL; trace_probe_log_set_index(0); if (event) { ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return -EINVAL; } if (!event) { @@ -1152,78 +1162,83 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } if (is_return) - ctx.flags |= TPARG_FL_RETURN; + ctx->flags |= TPARG_FL_RETURN; else - ctx.flags |= TPARG_FL_FENTRY; + ctx->flags |= TPARG_FL_FENTRY; if (is_tracepoint) { - ctx.flags |= TPARG_FL_TPOINT; + ctx->flags |= TPARG_FL_TPOINT; tpoint = find_tracepoint(symbol, &tp_mod); + /* lock module until register this tprobe. 
*/ + if (tp_mod && !try_module_get(tp_mod)) { + tpoint = NULL; + tp_mod = NULL; + } if (tpoint) { - ctx.funcname = kallsyms_lookup( + ctx->funcname = kallsyms_lookup( (unsigned long)tpoint->probestub, NULL, NULL, NULL, sbuf); } else if (IS_ENABLED(CONFIG_MODULES)) { /* This *may* be loaded afterwards */ tpoint = TRACEPOINT_STUB; - ctx.funcname = symbol; + ctx->funcname = symbol; } else { trace_probe_log_set_index(1); trace_probe_log_err(0, NO_TRACEPOINT); - goto parse_error; + return -EINVAL; } } else - ctx.funcname = symbol; + ctx->funcname = symbol; argc -= 2; argv += 2; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, - abuf, MAX_BTF_ARGS_LEN, &ctx); - if (IS_ERR(new_argv)) { - ret = PTR_ERR(new_argv); - new_argv = NULL; - goto out; - } + abuf, MAX_BTF_ARGS_LEN, ctx); + if (IS_ERR(new_argv)) + return PTR_ERR(new_argv); if (new_argv) { argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) { - ret = -E2BIG; - goto out; - } + if (argc > MAX_TRACE_ARGS) + return -E2BIG; ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) - goto out; + return ret; /* setup a probe */ tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, - maxactive, argc, is_return); + argc, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; /* We know tf is not allocated */ + return ret; } /* parse arguments */ for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ctx.offset = 0; - ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); + ctx->offset = 0; + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], ctx); if (ret) - goto error; /* This can be -ENOMEM */ + return ret; /* This can be -ENOMEM */ } if (is_return && tf->tp.entry_arg) { tf->fp.entry_handler = trace_fprobe_entry_handler; tf->fp.entry_data_size = traceprobe_get_entry_data_size(&tf->tp); + if (ALIGN(tf->fp.entry_data_size, sizeof(long)) > MAX_FPROBE_DATA_SIZE) { + trace_probe_log_set_index(2); + trace_probe_log_err(0, TOO_MANY_EARGS); + return -E2BIG; + } } ret = traceprobe_set_print_fmt(&tf->tp, is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); if (ret < 0) - goto error; + return ret; ret = register_trace_fprobe(tf); if (ret) { @@ -1234,29 +1249,32 @@ static int __trace_fprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_PROBE_ADDR); else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); - goto error; + return -EINVAL; } -out: - if (tp_mod) - module_put(tp_mod); + /* 'tf' is successfully registered. To avoid freeing, assign NULL. 
*/ + tf = NULL; + + return 0; +} + +static int trace_fprobe_create_cb(int argc, const char *argv[]) +{ + struct traceprobe_parse_context ctx = { + .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, + }; + int ret; + + trace_probe_log_init("trace_fprobe", argc, argv); + ret = trace_fprobe_create_internal(argc, argv, &ctx); traceprobe_finish_parse(&ctx); trace_probe_log_clear(); - kfree(new_argv); - kfree(symbol); - kfree(dbuf); return ret; - -parse_error: - ret = -EINVAL; -error: - free_trace_fprobe(tf); - goto out; } static int trace_fprobe_create(const char *raw_command) { - return trace_probe_create(raw_command, __trace_fprobe_create); + return trace_probe_create(raw_command, trace_fprobe_create_cb); } static int trace_fprobe_release(struct dyn_event *ev) @@ -1278,8 +1296,6 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) seq_putc(m, 't'); else seq_putc(m, 'f'); - if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) - seq_printf(m, "%d", tf->fp.nr_maxactive); seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), trace_probe_name(&tf->tp)); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index d358c9935164..df56f9b76010 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -216,7 +216,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, parent_ip = function_get_true_parent_ip(parent_ip, fregs); - trace_ctx = tracing_gen_ctx(); + trace_ctx = tracing_gen_ctx_dec(); data = this_cpu_ptr(tr->array_buffer.data); if (!atomic_read(&data->disabled)) @@ -321,7 +321,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned int trace_ctx; - unsigned long flags; int bit; if (unlikely(!tr->function_enabled)) @@ -347,8 +346,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (is_repeat_check(tr, last_info, ip, parent_ip)) goto out; - local_save_flags(flags); - trace_ctx = tracing_gen_ctx_flags(flags); + trace_ctx = tracing_gen_ctx_dec(); process_repeats(tr, ip, parent_ip, last_info, trace_ctx); trace_function(tr, ip, parent_ip, trace_ctx); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 5504b5e4e7b4..136c750b0b4d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -175,16 +175,16 @@ struct fgraph_times { }; int trace_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; struct trace_array_cpu *data; struct fgraph_times *ftimes; - unsigned long flags; unsigned int trace_ctx; long disabled; - int ret; + int ret = 0; int cpu; if (*task_var & TRACE_GRAPH_NOTRACE) @@ -198,7 +198,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, * returning from the function. */ if (ftrace_graph_notrace_addr(trace->func)) { - *task_var |= TRACE_GRAPH_NOTRACE_BIT; + *task_var |= TRACE_GRAPH_NOTRACE; /* * Need to return 1 to have the return called * that will clear the NOTRACE bit. 
@@ -235,25 +235,21 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, if (tracing_thresh) return 1; - local_irq_save(flags); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); - if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR))) { + disabled = atomic_read(&data->disabled); + if (likely(!disabled)) { + trace_ctx = tracing_gen_ctx(); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { unsigned long retaddr = ftrace_graph_top_ret_addr(current); - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); - } else + } else { ret = __trace_graph_entry(tr, trace, trace_ctx); - } else { - ret = 0; + } } - - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); return ret; } @@ -270,12 +266,10 @@ __trace_graph_function(struct trace_array *tr, struct ftrace_graph_ret ret = { .func = ip, .depth = 0, - .calltime = time, - .rettime = time, }; __trace_graph_entry(tr, &ent, trace_ctx); - __trace_graph_return(tr, &ret, trace_ctx); + __trace_graph_return(tr, &ret, trace_ctx, time, time); } void @@ -287,8 +281,9 @@ trace_graph_function(struct trace_array *tr, } void __trace_graph_return(struct trace_array *tr, - struct ftrace_graph_ret *trace, - unsigned int trace_ctx) + struct ftrace_graph_ret *trace, + unsigned int trace_ctx, + u64 calltime, u64 rettime) { struct ring_buffer_event *event; struct trace_buffer *buffer = tr->array_buffer.buffer; @@ -300,6 +295,8 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; + entry->calltime = calltime; + entry->rettime = rettime; trace_buffer_unlock_commit_nostack(buffer, event); } @@ -314,18 +311,20 @@ static void handle_nosleeptime(struct ftrace_graph_ret *trace, } void trace_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; struct trace_array_cpu *data; struct fgraph_times *ftimes; - unsigned long flags; unsigned int trace_ctx; + u64 calltime, rettime; long disabled; int size; int cpu; + rettime = trace_clock_local(); + ftrace_graph_addr_finish(gops, trace); if (*task_var & TRACE_GRAPH_NOTRACE) { @@ -339,22 +338,22 @@ void trace_graph_return(struct ftrace_graph_ret *trace, handle_nosleeptime(trace, ftimes, size); - trace->calltime = ftimes->calltime; + calltime = ftimes->calltime; - local_irq_save(flags); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); - __trace_graph_return(tr, trace, trace_ctx); + disabled = atomic_read(&data->disabled); + if (likely(!disabled)) { + trace_ctx = tracing_gen_ctx(); + __trace_graph_return(tr, trace, trace_ctx, calltime, rettime); } - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); } static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct fgraph_times *ftimes; int size; @@ -372,13 +371,11 @@ static void trace_graph_thresh_return(struct 
ftrace_graph_ret *trace, handle_nosleeptime(trace, ftimes, size); - trace->calltime = ftimes->calltime; - if (tracing_thresh && - (trace->rettime - ftimes->calltime < tracing_thresh)) + (trace_clock_local() - ftimes->calltime < tracing_thresh)) return; else - trace_graph_return(trace, gops); + trace_graph_return(trace, gops, fregs); } static struct fgraph_ops funcgraph_ops = { @@ -861,7 +858,7 @@ print_graph_entry_leaf(struct trace_iterator *iter, graph_ret = &ret_entry->ret; call = &entry->graph_ent; - duration = graph_ret->rettime - graph_ret->calltime; + duration = ret_entry->rettime - ret_entry->calltime; func = call->func + iter->tr->text_delta; @@ -1142,11 +1139,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, } static enum print_line_t -print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, +print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s, struct trace_entry *ent, struct trace_iterator *iter, u32 flags) { - unsigned long long duration = trace->rettime - trace->calltime; + struct ftrace_graph_ret *trace = &retentry->ret; + u64 calltime = retentry->calltime; + u64 rettime = retentry->rettime; + unsigned long long duration = rettime - calltime; struct fgraph_data *data = iter->private; struct trace_array *tr = iter->tr; unsigned long func; @@ -1347,7 +1347,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) case TRACE_GRAPH_RET: { struct ftrace_graph_ret_entry *field; trace_assign_type(field, entry); - return print_graph_return(&field->ret, s, entry, iter, flags); + return print_graph_return(field, s, entry, iter, flags); } case TRACE_STACK: case TRACE_FN: diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a4e799c1e767..7294ad676379 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -176,7 +176,8 @@ static int irqsoff_display_graph(struct trace_array *tr, int set) } static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; @@ -214,13 +215,15 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, } static void irqsoff_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; unsigned int trace_ctx; u64 *calltime; + u64 rettime; int size; ftrace_graph_addr_finish(gops, trace); @@ -228,13 +231,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace, if (!func_prolog_dec(tr, &data, &flags)) return; + rettime = trace_clock_local(); calltime = fgraph_retrieve_data(gops->idx, &size); if (!calltime) return; - trace->calltime = *calltime; trace_ctx = tracing_gen_ctx_flags(flags); - __trace_graph_return(tr, trace, trace_ctx); + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); atomic_dec(&data->disabled); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0642ea174849..d8d5f18a141a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "trace_kprobe: " fmt #include <linux/bpf-cgroup.h> +#include <linux/cleanup.h> #include <linux/security.h> #include <linux/module.h> #include <linux/uaccess.h> @@ -257,6 +258,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk) } } +DEFINE_FREE(free_trace_kprobe, 
struct trace_kprobe *, + if (!IS_ERR_OR_NULL(_T)) free_trace_kprobe(_T)) + /* * Allocate new trace_probe and initialize it (including kprobes). */ @@ -268,7 +272,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, int maxactive, int nargs, bool is_return) { - struct trace_kprobe *tk; + struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; int ret = -ENOMEM; tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL); @@ -277,12 +281,12 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->nhit = alloc_percpu(unsigned long); if (!tk->nhit) - goto error; + return ERR_PTR(ret); if (symbol) { tk->symbol = kstrdup(symbol, GFP_KERNEL); if (!tk->symbol) - goto error; + return ERR_PTR(ret); tk->rp.kp.symbol_name = tk->symbol; tk->rp.kp.offset = offs; } else @@ -299,13 +303,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, ret = trace_probe_init(&tk->tp, event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&tk->devent, &trace_kprobe_ops); - return tk; -error: - free_trace_kprobe(tk); - return ERR_PTR(ret); + return_ptr(tk); } static struct trace_kprobe *find_trace_kprobe(const char *event, @@ -634,7 +635,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) struct trace_kprobe *old_tk; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); old_tk = find_trace_kprobe(trace_probe_name(&tk->tp), trace_probe_group_name(&tk->tp)); @@ -642,11 +643,9 @@ static int register_trace_kprobe(struct trace_kprobe *tk) if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); - ret = -EEXIST; - } else { - ret = append_trace_kprobe(tk, old_tk); + return -EEXIST; } - goto end; + return append_trace_kprobe(tk, old_tk); } /* Register new event */ @@ -657,7 +656,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } /* Register k*probe */ @@ -672,8 +671,6 @@ static int register_trace_kprobe(struct trace_kprobe *tk) else dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp)); -end: - mutex_unlock(&event_mutex); return ret; } @@ -706,7 +703,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; /* Update probes on coming module */ - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); for_each_trace_kprobe(tk, pos) { if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ @@ -718,7 +715,6 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, module_name(mod), ret); } } - mutex_unlock(&event_mutex); return NOTIFY_DONE; } @@ -840,7 +836,8 @@ out: static int trace_kprobe_entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs); -static int __trace_kprobe_create(int argc, const char *argv[]) +static int trace_kprobe_create_internal(int argc, const char *argv[], + struct traceprobe_parse_context *ctx) { /* * Argument syntax: @@ -866,11 +863,12 @@ static int __trace_kprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ - struct trace_kprobe *tk = NULL; + struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; int i, len, new_argc = 0, ret = 0; bool is_return = false; - char *symbol = NULL, *tmp = NULL; - const char **new_argv = NULL; + char *symbol __free(kfree) = NULL; + char *tmp = NULL; + const char **new_argv __free(kfree) = NULL; const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; enum probe_print_type ptype; int maxactive = 0; @@ -879,8 +877,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; - char *dbuf = NULL; - struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + char *dbuf __free(kfree) = NULL; switch (argv[0][0]) { case 'r': @@ -894,8 +891,6 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (argc < 2) return -ECANCELED; - trace_probe_log_init("trace_kprobe", argc, argv); - event = strchr(&argv[0][1], ':'); if (event) event++; @@ -903,7 +898,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (isdigit(argv[0][1])) { if (!is_return) { trace_probe_log_err(1, BAD_MAXACT_TYPE); - goto parse_error; + return -EINVAL; } if (event) len = event - &argv[0][1] - 1; @@ -911,21 +906,21 @@ static int __trace_kprobe_create(int argc, const char *argv[]) len = strlen(&argv[0][1]); if (len > MAX_EVENT_NAME_LEN - 1) { trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; + return -EINVAL; } memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; + return -EINVAL; } /* kretprobes instances are iterated over via a list. The * maximum should stay reasonable. */ if (maxactive > KRETPROBE_MAXACTIVE_MAX) { trace_probe_log_err(1, MAXACT_TOO_BIG); - goto parse_error; + return -EINVAL; } } @@ -934,16 +929,13 @@ static int __trace_kprobe_create(int argc, const char *argv[]) if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { trace_probe_log_set_index(1); /* Check whether uprobe event specified */ - if (strchr(argv[1], '/') && strchr(argv[1], ':')) { - ret = -ECANCELED; - goto error; - } + if (strchr(argv[1], '/') && strchr(argv[1], ':')) + return -ECANCELED; + /* a symbol specified */ symbol = kstrdup(argv[1], GFP_KERNEL); - if (!symbol) { - ret = -ENOMEM; - goto error; - } + if (!symbol) + return -ENOMEM; tmp = strchr(symbol, '%'); if (tmp) { @@ -952,7 +944,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) is_return = true; } else { trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX); - goto parse_error; + return -EINVAL; } } @@ -960,7 +952,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { trace_probe_log_err(0, BAD_PROBE_ADDR); - goto parse_error; + return -EINVAL; } ret = validate_probe_symbol(symbol); if (ret) { @@ -968,17 +960,17 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, NON_UNIQ_SYMBOL); else trace_probe_log_err(0, BAD_PROBE_ADDR); - goto parse_error; + return -EINVAL; } if (is_return) - ctx.flags |= TPARG_FL_RETURN; + ctx->flags |= TPARG_FL_RETURN; ret = kprobe_on_func_entry(NULL, symbol, offset); if (ret == 0 && !is_return) - ctx.flags |= TPARG_FL_FENTRY; + ctx->flags |= TPARG_FL_FENTRY; /* Defer the ENOENT case until register kprobe */ if (ret == -EINVAL && is_return) { trace_probe_log_err(0, BAD_RETPROBE); - goto parse_error; + return -EINVAL; } } @@ 
-987,7 +979,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return ret; } if (!event) { @@ -1003,26 +995,24 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } argc -= 2; argv += 2; - ctx.funcname = symbol; + ctx->funcname = symbol; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, - abuf, MAX_BTF_ARGS_LEN, &ctx); + abuf, MAX_BTF_ARGS_LEN, ctx); if (IS_ERR(new_argv)) { ret = PTR_ERR(new_argv); new_argv = NULL; - goto out; + return ret; } if (new_argv) { argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) { - ret = -E2BIG; - goto out; - } + if (argc > MAX_TRACE_ARGS) + return -E2BIG; ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) - goto out; + return ret; /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, @@ -1031,16 +1021,16 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ret = PTR_ERR(tk); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; /* We know tk is not allocated */ + return ret; /* We know tk is not allocated */ } /* parse arguments */ for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ctx.offset = 0; - ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx); + ctx->offset = 0; + ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], ctx); if (ret) - goto error; /* This can be -ENOMEM */ + return ret; /* This can be -ENOMEM */ } /* entry handler for kretprobe */ if (is_return && tk->tp.entry_arg) { @@ -1051,7 +1041,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; ret = traceprobe_set_print_fmt(&tk->tp, ptype); if (ret < 0) - goto error; + return ret; ret = register_trace_kprobe(tk); if (ret) { @@ -1062,27 +1052,34 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_PROBE_ADDR); else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); - goto error; + return ret; } + /* + * Here, 'tk' has been registered to the list successfully, + * so we don't need to free it. + */ + tk = NULL; + + return 0; +} + +static int trace_kprobe_create_cb(int argc, const char *argv[]) +{ + struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + int ret; + + trace_probe_log_init("trace_kprobe", argc, argv); + + ret = trace_kprobe_create_internal(argc, argv, &ctx); -out: traceprobe_finish_parse(&ctx); trace_probe_log_clear(); - kfree(new_argv); - kfree(symbol); - kfree(dbuf); return ret; - -parse_error: - ret = -EINVAL; -error: - free_trace_kprobe(tk); - goto out; } static int trace_kprobe_create(const char *raw_command) { - return trace_probe_create(raw_command, __trace_kprobe_create); + return trace_probe_create(raw_command, trace_kprobe_create_cb); } static int create_or_delete_trace_kprobe(const char *raw_command) @@ -1898,7 +1895,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, bool is_return) { enum probe_print_type ptype; - struct trace_kprobe *tk; + struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; int ret; char *event; @@ -1929,19 +1926,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, ptype = trace_kprobe_is_return(tk) ? 
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; - if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) { - ret = -ENOMEM; - goto error; - } + if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) + return ERR_PTR(-ENOMEM); ret = __register_trace_kprobe(tk); if (ret < 0) - goto error; + return ERR_PTR(ret); - return trace_probe_event_call(&tk->tp); -error: - free_trace_kprobe(tk); - return ERR_PTR(ret); + return trace_probe_event_call(&(no_free_ptr(tk)->tp)); } void destroy_local_trace_kprobe(struct trace_event_call *event_call) @@ -1970,13 +1962,12 @@ static __init void enable_boot_kprobe_events(void) struct trace_kprobe *tk; struct dyn_event *pos; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); for_each_trace_kprobe(tk, pos) { list_for_each_entry(file, &tr->events, list) if (file->event_call == trace_probe_event_call(&tk->tp)) trace_event_enable_disable(file, 1, 0); } - mutex_unlock(&event_mutex); } static __init void setup_boot_kprobe_events(void) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index b9f96c77527d..92e16f03fa4e 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1229,6 +1229,8 @@ static void trace_sched_migrate_callback(void *data, struct task_struct *p, int } } +static bool monitor_enabled; + static int register_migration_monitor(void) { int ret = 0; @@ -1237,16 +1239,25 @@ static int register_migration_monitor(void) * Timerlat thread migration check is only required when running timerlat in user-space. * Thus, enable callback only if timerlat is set with no workload. */ - if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) + if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) { + if (WARN_ON_ONCE(monitor_enabled)) + return 0; + ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + if (!ret) + monitor_enabled = true; + } return ret; } static void unregister_migration_monitor(void) { - if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) - unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + if (!monitor_enabled) + return; + + unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL); + monitor_enabled = false; } #else static int register_migration_monitor(void) @@ -1531,27 +1542,25 @@ static int run_osnoise(void) /* * In some cases, notably when running on a nohz_full CPU with - * a stopped tick PREEMPT_RCU has no way to account for QSs. - * This will eventually cause unwarranted noise as PREEMPT_RCU - * will force preemption as the means of ending the current - * grace period. We avoid this problem by calling - * rcu_momentary_eqs(), which performs a zero duration - * EQS allowing PREEMPT_RCU to end the current grace period. - * This call shouldn't be wrapped inside an RCU critical - * section. + * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to + * account for QSs. This will eventually cause unwarranted + * noise as RCU forces preemption as the means of ending the + * current grace period. We avoid this by calling + * rcu_momentary_eqs(), which performs a zero duration EQS + * allowing RCU to end the current grace period. This call + * shouldn't be wrapped inside an RCU critical section. * - * Note that in non PREEMPT_RCU kernels QSs are handled through - * cond_resched() + * Normally QSs for other cases are handled through cond_resched(). + * For simplicity, however, we call rcu_momentary_eqs() for all + * configurations here. 
*/ - if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { - if (!disable_irq) - local_irq_disable(); + if (!disable_irq) + local_irq_disable(); - rcu_momentary_eqs(); + rcu_momentary_eqs(); - if (!disable_irq) - local_irq_enable(); - } + if (!disable_irq) + local_irq_enable(); /* * For the non-preemptive kernel config: let threads runs, if @@ -1890,8 +1899,7 @@ static int timerlat_main(void *data) tlat->count = 0; tlat->tracing_thread = false; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); tlat->kthread = current; osn_var->pid = current->pid; /* @@ -2083,26 +2091,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy) { unsigned int cpu = smp_processor_id(); - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!osnoise_has_registered_instances()) - goto out_unlock_trace; + return; - mutex_lock(&interface_lock); - cpus_read_lock(); + guard(mutex)(&interface_lock); + guard(cpus_read_lock)(); if (!cpu_online(cpu)) - goto out_unlock; + return; + if (!cpumask_test_cpu(cpu, &osnoise_cpumask)) - goto out_unlock; + return; start_kthread(cpu); - -out_unlock: - cpus_read_unlock(); - mutex_unlock(&interface_lock); -out_unlock_trace: - mutex_unlock(&trace_types_lock); } static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn); @@ -2300,31 +2303,22 @@ static ssize_t osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; - mutex_lock(&interface_lock); + guard(mutex)(&interface_lock); len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1; mask_str = kmalloc(len, GFP_KERNEL); - if (!mask_str) { - count = -ENOMEM; - goto out_unlock; - } + if (!mask_str) + return -ENOMEM; len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_free; - } + if (len >= count) + return -EINVAL; count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); -out_free: - kfree(mask_str); -out_unlock: - mutex_unlock(&interface_lock); - return count; } @@ -2459,8 +2453,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file) tlat = this_cpu_tmr_var(); tlat->count = 0; - hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); - tlat->timer.function = timerlat_irq; + hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); migrate_enable(); return 0; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 16a5e368e7b7..8f58ee1e8858 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1409,7 +1409,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, struct traceprobe_parse_context *ctx) { struct fetch_insn *code, *tmp = NULL; - char *type, *arg; + char *type, *arg __free(kfree) = NULL; int ret, len; len = strlen(argv); @@ -1426,22 +1426,16 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size, return -ENOMEM; parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - ret = -ENOMEM; - goto out; - } + if (!parg->comm) + return -ENOMEM; type = parse_probe_arg_type(arg, parg, ctx); - if (IS_ERR(type)) { - ret = PTR_ERR(type); - goto out; - } + if (IS_ERR(type)) + return PTR_ERR(type); code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); - if (!code) { - ret = -ENOMEM; - goto out; - } + if 
(!code) + return -ENOMEM; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ctx->last_type = NULL; @@ -1497,8 +1491,6 @@ fail: kfree(code->data); } kfree(tmp); -out: - kfree(arg); return ret; } @@ -1668,7 +1660,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], { const struct btf_param *params = NULL; int i, j, n, used, ret, args_idx = -1; - const char **new_argv = NULL; + const char **new_argv __free(kfree) = NULL; ret = argv_has_var_arg(argc, argv, &args_idx, ctx); if (ret < 0) @@ -1707,7 +1699,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], ret = sprint_nth_btf_arg(n, "", buf + used, bufsize - used, ctx); if (ret < 0) - goto error; + return ERR_PTR(ret); new_argv[j++] = buf + used; used += ret + 1; @@ -1721,25 +1713,20 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], n = simple_strtoul(argv[i] + 4, &type, 10); if (type && !(*type == ':' || *type == '\0')) { trace_probe_log_err(0, BAD_VAR); - ret = -ENOENT; - goto error; + return ERR_PTR(-ENOENT); } /* Note: $argN starts from $arg1 */ ret = sprint_nth_btf_arg(n - 1, type, buf + used, bufsize - used, ctx); if (ret < 0) - goto error; + return ERR_PTR(ret); new_argv[j++] = buf + used; used += ret + 1; } else new_argv[j++] = argv[i]; } - return new_argv; - -error: - kfree(new_argv); - return ERR_PTR(ret); + return_ptr(new_argv); } /* @buf: *buf must be equal to NULL. Caller must to free *buf */ @@ -1747,14 +1734,14 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) { int i, used, ret; const int bufsize = MAX_DENTRY_ARGS_LEN; - char *tmpbuf = NULL; + char *tmpbuf __free(kfree) = NULL; if (*buf) return -EINVAL; used = 0; for (i = 0; i < argc; i++) { - char *tmp; + char *tmp __free(kfree) = NULL; char *equal; size_t arg_len; @@ -1769,7 +1756,7 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) tmp = kstrdup(argv[i], GFP_KERNEL); if (!tmp) - goto nomem; + return -ENOMEM; equal = strchr(tmp, '='); if (equal) @@ -1790,18 +1777,14 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) offsetof(struct file, f_path.dentry), equal ? 
equal + 1 : tmp); - kfree(tmp); if (ret >= bufsize - used) - goto nomem; + return -ENOMEM; argv[i] = tmpbuf + used; used += ret + 1; } - *buf = tmpbuf; + *buf = no_free_ptr(tmpbuf); return 0; -nomem: - kfree(tmpbuf); - return -ENOMEM; } void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5803e6a41570..96792bc4b092 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -36,7 +36,6 @@ #define MAX_BTF_ARGS_LEN 128 #define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX -#define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" @@ -481,6 +480,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ C(NO_TRACEPOINT, "Tracepoint is not found"), \ + C(BAD_TP_NAME, "Invalid character in tracepoint name"),\ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ C(NO_GROUP_NAME, "Group name is not specified"), \ C(GROUP_TOO_LONG, "Group name is too long"), \ @@ -544,7 +544,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(NO_BTF_FIELD, "This field is not found."), \ C(BAD_BTF_TID, "Failed to get BTF type info."),\ C(BAD_TYPE4STR, "This type does not fit for string."),\ - C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"), + C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\ + C(TOO_MANY_EARGS, "Too many entry arguments specified"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 2caf0d2afb32..f39b37fcdb3b 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -232,7 +232,7 @@ array: /* Sum up total data length for dynamic arrays (strings) */ static nokprobe_inline int -__get_data_size(struct trace_probe *tp, struct pt_regs *regs, void *edata) +__get_data_size(struct trace_probe *tp, void *regs, void *edata) { struct probe_arg *arg; int i, len, ret = 0; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index c58292e424d5..af30586f1aea 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -113,7 +113,8 @@ static int wakeup_display_graph(struct trace_array *tr, int set) } static int wakeup_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; @@ -150,12 +151,14 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, } static void wakeup_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned int trace_ctx; u64 *calltime; + u64 rettime; int size; ftrace_graph_addr_finish(gops, trace); @@ -163,12 +166,13 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace, if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return; + rettime = trace_clock_local(); + calltime = fgraph_retrieve_data(gops->idx, &size); if (!calltime) return; - trace->calltime = *calltime; - __trace_graph_return(tr, trace, trace_ctx); + __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime); atomic_dec(&data->disabled); 
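/*
 * Editor's aside (illustrative sketch, not part of the patch): alongside
 * guard(mutex)(), the other recurring conversion in the trace_probe,
 * trace_kprobe and trace_fprobe hunks is scope-based freeing from
 * <linux/cleanup.h>: locals annotated with __free(kfree) are freed
 * automatically on every return, DEFINE_FREE() attaches a custom
 * destructor, and return_ptr()/no_free_ptr() transfer ownership to the
 * caller on success.  The shape, with a hypothetical struct widget:
 */
struct widget { char *name; };

static void free_widget(struct widget *w)
{
	if (!w)
		return;
	kfree(w->name);
	kfree(w);
}
DEFINE_FREE(free_widget, struct widget *, if (!IS_ERR_OR_NULL(_T)) free_widget(_T))

static struct widget *alloc_widget(const char *name)
{
	struct widget *w __free(free_widget) = NULL;

	w = kzalloc(sizeof(*w), GFP_KERNEL);
	if (!w)
		return ERR_PTR(-ENOMEM);

	w->name = kstrdup(name, GFP_KERNEL);
	if (!w->name)
		return ERR_PTR(-ENOMEM);	/* w is freed automatically here */

	return_ptr(w);	/* success: hand w to the caller without freeing it */
}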
preempt_enable_notrace(); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 38b5754790c9..d88c44f1dfa5 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -774,7 +774,8 @@ struct fgraph_fixture { }; static __init int store_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); const char *type = fixture->store_type_name; @@ -807,7 +808,8 @@ static __init int store_entry(struct ftrace_graph_ent *trace, } static __init void store_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); const char *type = fixture->store_type_name; @@ -1025,7 +1027,8 @@ static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { /* This is harmlessly racy, we want to approximately detect a hang */ if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { @@ -1039,7 +1042,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, return 0; } - return trace_graph_entry(trace, gops); + return trace_graph_entry(trace, gops, fregs); } static struct fgraph_ops fgraph_ops __initdata = { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 7f9572a37333..14c6f272c4d8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -520,20 +520,18 @@ stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, int was_enabled; int ret; - mutex_lock(&stack_sysctl_mutex); + guard(mutex)(&stack_sysctl_mutex); was_enabled = !!stack_tracer_enabled; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (was_enabled == !!stack_tracer_enabled)) - goto out; + return ret; if (stack_tracer_enabled) register_ftrace_function(&trace_ops); else unregister_ftrace_function(&trace_ops); - out: - mutex_unlock(&stack_sysctl_mutex); return ret; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index bb247beec447..b3b5586f104d 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -128,7 +128,7 @@ static int stat_seq_init(struct stat_session *session) int ret = 0; int i; - mutex_lock(&session->stat_mutex); + guard(mutex)(&session->stat_mutex); __reset_stat_session(session); if (!ts->stat_cmp) @@ -136,11 +136,11 @@ static int stat_seq_init(struct stat_session *session) stat = ts->stat_start(ts); if (!stat) - goto exit; + return 0; ret = insert_stat(root, stat, ts->stat_cmp); if (ret) - goto exit; + return ret; /* * Iterate over the tracer stat entries and store them in an rbtree. 
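/*
 * Editor's aside (illustrative sketch, not part of the patch): many hunks
 * in this diff replace mutex_lock()/goto out/mutex_unlock() sequences with
 * guard(mutex)() from <linux/cleanup.h>; the mutex is released
 * automatically when the guard goes out of scope, so error paths can
 * simply return.  The shape of the conversion, with a hypothetical
 * struct thing:
 */
struct thing {
	struct mutex lock;
	bool ready;
	unsigned long count;
};

static int frob_thing(struct thing *t)
{
	guard(mutex)(&t->lock);		/* dropped on every return below */

	if (!t->ready)
		return -ENODEV;

	t->count++;
	return 0;
}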
@@ -157,13 +157,10 @@ static int stat_seq_init(struct stat_session *session) goto exit_free_rbtree; } -exit: - mutex_unlock(&session->stat_mutex); return ret; exit_free_rbtree: __reset_stat_session(session); - mutex_unlock(&session->stat_mutex); return ret; } @@ -308,7 +305,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { struct stat_session *session, *node; - int ret = -EINVAL; + int ret; if (!trace) return -EINVAL; @@ -316,18 +313,18 @@ int register_stat_tracer(struct tracer_stat *trace) if (!trace->stat_start || !trace->stat_next || !trace->stat_show) return -EINVAL; + guard(mutex)(&all_stat_sessions_mutex); + /* Already registered? */ - mutex_lock(&all_stat_sessions_mutex); list_for_each_entry(node, &all_stat_sessions, session_list) { if (node->ts == trace) - goto out; + return -EINVAL; } - ret = -ENOMEM; /* Init the session */ session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) - goto out; + return -ENOMEM; session->ts = trace; INIT_LIST_HEAD(&session->session_list); @@ -336,16 +333,13 @@ int register_stat_tracer(struct tracer_stat *trace) ret = init_stat_file(session); if (ret) { destroy_session(session); - goto out; + return ret; } - ret = 0; /* Register */ list_add_tail(&session->session_list, &all_stat_sessions); - out: - mutex_unlock(&all_stat_sessions_mutex); - return ret; + return 0; } void unregister_stat_tracer(struct tracer_stat *trace) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4875e7f5de3d..ccc762fbb69c 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -498,11 +498,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) struct trace_uprobe *old_tu; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); ret = validate_ref_ctr_offset(tu); if (ret) - goto end; + return ret; /* register as an event */ old_tu = find_probe_event(trace_probe_name(&tu->tp), @@ -511,11 +511,9 @@ static int register_trace_uprobe(struct trace_uprobe *tu) if (is_ret_probe(tu) != is_ret_probe(old_tu)) { trace_probe_log_set_index(0); trace_probe_log_err(0, DIFF_PROBE_TYPE); - ret = -EEXIST; - } else { - ret = append_trace_uprobe(tu, old_tu); + return -EEXIST; } - goto end; + return append_trace_uprobe(tu, old_tu); } ret = register_uprobe_event(tu); @@ -525,14 +523,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp)); -end: - mutex_unlock(&event_mutex); - return ret; } diff --git a/kernel/ucount.c b/kernel/ucount.c index f950b5e59d63..86c5f1c0bad9 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -164,8 +164,8 @@ struct ucounts *get_ucounts(struct ucounts *ucounts) struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); - struct ucounts *ucounts, *new; bool wrapped; + struct ucounts *ucounts, *new = NULL; spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -182,17 +182,17 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); - if (ucounts) { - kfree(new); - } else { + if (!ucounts) { hlist_add_head(&new->node, hashent); get_user_ns(new->ns); spin_unlock_irq(&ucounts_lock); return new; } } + wrapped = !get_ucounts_or_wrap(ucounts); spin_unlock_irq(&ucounts_lock); + 
kfree(new); if (wrapped) { put_ucounts(ucounts); return NULL; diff --git a/kernel/umh.c b/kernel/umh.c index be9234270777..b4da45a3a7cf 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -544,7 +544,7 @@ static int proc_cap_handler(const struct ctl_table *table, int write, return 0; } -static struct ctl_table usermodehelper_table[] = { +static const struct ctl_table usermodehelper_table[] = { { .procname = "bset", .data = &usermodehelper_bset, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa0b2e47f2f2..682f40d5632d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -238,7 +238,7 @@ EXPORT_SYMBOL(__put_user_ns); struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ - u32 count; /* == 0 unless used with map_id_range_down() */ + u32 count; }; /* @@ -343,16 +343,19 @@ u32 map_id_down(struct uid_gid_map *map, u32 id) * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * -map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; - u32 first, last; + u32 first, last, id2; + + id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; - if (id >= first && id <= last) + if (id >= first && id <= last && + (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; @@ -363,28 +366,28 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * -map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) +map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; - key.count = 1; + key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } -u32 map_id_up(struct uid_gid_map *map, u32 id) +u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - extent = map_id_up_base(extents, map, id); + extent = map_id_range_up_base(extents, map, id, count); else - extent = map_id_up_max(extents, map, id); + extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) @@ -395,6 +398,11 @@ u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return map_id_range_up(map, id, 1); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. 
* @ns: User namespace that the uid is in diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 7282f61a8650..bfbaaecb1dd4 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -75,7 +75,7 @@ static DEFINE_CTL_TABLE_POLL(hostname_poll); static DEFINE_CTL_TABLE_POLL(domainname_poll); // Note: update 'enum uts_proc' to match any changes to this table -static struct ctl_table uts_kern_table[] = { +static const struct ctl_table uts_kern_table[] = { { .procname = "arch", .data = init_uts_ns.name.machine, @@ -129,7 +129,7 @@ static struct ctl_table uts_kern_table[] = { */ void uts_proc_notify(enum uts_proc proc) { - struct ctl_table *table = &uts_kern_table[proc]; + const struct ctl_table *table = &uts_kern_table[proc]; proc_sys_poll_notify(table->poll); } diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 8800f5acc007..2ef2e1b80091 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -133,7 +133,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); if (!vtsk) - return NULL; + return ERR_PTR(-ENOMEM); init_completion(&vtsk->exited); mutex_init(&vtsk->exit_mutex); vtsk->data = arg; @@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); - return NULL; + return ERR_PTR(PTR_ERR(tsk)); } vtsk->task = tsk; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 5267adeaa403..7e45559521af 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -101,12 +101,11 @@ static bool post_one_notification(struct watch_queue *wqueue, struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; - unsigned int head, tail, mask, note, offset, len; + unsigned int head, tail, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); - mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) @@ -124,7 +123,7 @@ static bool post_one_notification(struct watch_queue *wqueue, memcpy(p + offset, n, len); kunmap_atomic(p); - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; @@ -147,7 +146,7 @@ out: return done; lost: - buf = &pipe->bufs[(head - 1) & mask]; + buf = pipe_buf(pipe, head - 1); buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } @@ -269,6 +268,15 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) if (ret < 0) goto error; + /* + * pipe_resize_ring() does not update nr_accounted for watch_queue + * pipes, because the above vastly overprovisions. Set nr_accounted on + * and max_usage this pipe to the number that was actually charged to + * the user above via account_pipe_buffers. + */ + pipe->max_usage = nr_pages; + pipe->nr_accounted = nr_pages; + ret = -ENOMEM; pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 41e0f7e9fa35..9fa2af9dbf2c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -190,7 +190,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) * with printk_cpu_sync_get_irqsave() that we can still at least * get the message about the lockup out. 
*/ - pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu); + pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu); printk_cpu_sync_get_irqsave(flags); print_modules(); @@ -347,8 +347,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -static void __lockup_detector_cleanup(void); - #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM enum stats_per_group { STATS_SYSTEM, @@ -797,8 +795,7 @@ static void watchdog_enable(unsigned int cpu) * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - hrtimer->function = watchdog_timer_fn; + hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED_HARD); @@ -886,11 +883,6 @@ static void __lockup_detector_reconfigure(void) watchdog_hardlockup_start(); cpus_read_unlock(); - /* - * Must be called outside the cpus locked section to prevent - * recursive locking in the perf code. - */ - __lockup_detector_cleanup(); } void lockup_detector_reconfigure(void) @@ -940,24 +932,6 @@ static inline void lockup_detector_setup(void) } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -static void __lockup_detector_cleanup(void) -{ - lockdep_assert_held(&watchdog_mutex); - hardlockup_detector_perf_cleanup(); -} - -/** - * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes - * - * Caller must not hold the cpu hotplug rwsem. - */ -void lockup_detector_cleanup(void) -{ - mutex_lock(&watchdog_mutex); - __lockup_detector_cleanup(); - mutex_unlock(&watchdog_mutex); -} - /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * @@ -1094,7 +1068,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, static const int sixty = 60; -static struct ctl_table watchdog_sysctls[] = { +static const struct ctl_table watchdog_sysctls[] = { { .procname = "watchdog", .data = &watchdog_user_enabled, diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 59c1d86a73a2..a78ff092d636 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -21,8 +21,6 @@ #include <linux/perf_event.h> static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -static DEFINE_PER_CPU(struct perf_event *, dead_event); -static struct cpumask dead_events_mask; static atomic_t watchdog_cpus = ATOMIC_INIT(0); @@ -146,6 +144,7 @@ static int hardlockup_detector_event_create(void) PTR_ERR(evt)); return PTR_ERR(evt); } + WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); this_cpu_write(watchdog_ev, evt); return 0; } @@ -181,37 +180,13 @@ void watchdog_hardlockup_disable(unsigned int cpu) if (event) { perf_event_disable(event); + perf_event_release_kernel(event); this_cpu_write(watchdog_ev, NULL); - this_cpu_write(dead_event, event); - cpumask_set_cpu(smp_processor_id(), &dead_events_mask); atomic_dec(&watchdog_cpus); } } /** - * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them - * - * Called from lockup_detector_cleanup(). Serialized by the caller. - */ -void hardlockup_detector_perf_cleanup(void) -{ - int cpu; - - for_each_cpu(cpu, &dead_events_mask) { - struct perf_event *event = per_cpu(dead_event, cpu); - - /* - * Required because for_each_cpu() reports unconditionally - * CPU0 as set on UP kernels. Sigh. 
- */ - if (event) - perf_event_release_kernel(event); - per_cpu(dead_event, cpu) = NULL; - } - cpumask_clear(&dead_events_mask); -} - -/** * hardlockup_detector_perf_stop - Globally stop watchdog events * * Special interface for x86 to handle the perf HT bug. diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9362484a653c..bfe030b443e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2180,7 +2180,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ - kasan_record_aux_stack_noalloc(work); + kasan_record_aux_stack(work); /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); @@ -2254,8 +2254,10 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * queues a new work item to a wq after destroy_workqueue(wq). */ if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && - WARN_ON_ONCE(!is_chained_work(wq)))) + WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n", + work->func, wq->name))) { return; + } rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ @@ -3517,12 +3519,6 @@ repeat: } /* - * Put the reference grabbed by send_mayday(). @pool won't - * go away while we're still attached to it. - */ - put_pwq(pwq); - - /* * Leave this pool. Notify regular workers; otherwise, we end up * with 0 concurrency and stalling the execution. */ @@ -3532,6 +3528,12 @@ repeat: worker_detach_from_pool(rescuer); + /* + * Put the reference grabbed by send_mayday(). @pool might + * go away any time after it. + */ + put_pwq_unlocked(pwq); + raw_spin_lock_irq(&wq_mayday_lock); } @@ -7840,7 +7842,7 @@ static void __init wq_cpu_intensive_thresh_init(void) unsigned long thresh; unsigned long bogo; - pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release"); BUG_ON(IS_ERR(pwq_release_worker)); /* if the user set it to a specific value, keep it */
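Several of the hunks above (trace_stack.c, trace_stat.c, trace_uprobe.c) replace mutex_lock()/mutex_unlock() pairs and their goto-out labels with the kernel's scope-based guard(mutex)() helper, which is built on the cleanup-attribute machinery in include/linux/cleanup.h and releases the lock automatically on every exit from the scope. As a rough illustration only, here is a minimal stand-alone userspace C sketch of the same idea using pthreads and the compiler's cleanup attribute (GCC/Clang); the MUTEX_GUARD macro, mutex_guard_release() helper, update_stat() function and stat_mutex variable are invented names for this sketch and are not the kernel API.

/*
 * Stand-alone sketch of scope-based locking, analogous in spirit to the
 * kernel's guard(mutex)() conversions shown in the diffs above.
 * Build: cc -o guard_demo guard_demo.c -lpthread
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t stat_mutex = PTHREAD_MUTEX_INITIALIZER;
static int stat_count;

/* Called automatically when a guarded variable goes out of scope. */
static void mutex_guard_release(pthread_mutex_t **m)
{
	if (*m)
		pthread_mutex_unlock(*m);
}

/* Invented stand-in for the kernel's guard(mutex)(&lock). */
#define MUTEX_GUARD(name, m)						\
	pthread_mutex_t *name						\
		__attribute__((cleanup(mutex_guard_release))) = (m);	\
	pthread_mutex_lock(name)

static int update_stat(int delta)
{
	MUTEX_GUARD(g, &stat_mutex);

	if (delta == 0)
		return -1;	/* early return: unlock happens automatically */

	stat_count += delta;
	return 0;		/* normal return: same automatic unlock */
}

int main(void)
{
	update_stat(1);
	update_stat(0);
	printf("stat_count = %d\n", stat_count);
	return 0;
}

The payoff is what the register_stat_tracer() and register_trace_uprobe() hunks show: once the unlock is tied to scope exit, the "ret = ...; goto out;" bookkeeping collapses into direct return statements on each error path, with no risk of leaving the mutex held.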