path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit_watch.c | 12
-rw-r--r--  kernel/auditsc.c | 12
-rw-r--r--  kernel/bpf/Makefile | 2
-rw-r--r--  kernel/bpf/arena.c | 9
-rw-r--r--  kernel/bpf/bpf_cgrp_storage.c | 11
-rw-r--r--  kernel/bpf/bpf_iter.c | 13
-rw-r--r--  kernel/bpf/bpf_lsm.c | 2
-rw-r--r--  kernel/bpf/bpf_struct_ops.c | 135
-rw-r--r--  kernel/bpf/btf.c | 154
-rw-r--r--  kernel/bpf/cgroup.c | 33
-rw-r--r--  kernel/bpf/core.c | 117
-rw-r--r--  kernel/bpf/cpumap.c | 146
-rw-r--r--  kernel/bpf/cpumask.c | 53
-rw-r--r--  kernel/bpf/disasm.c | 16
-rw-r--r--  kernel/bpf/hashtab.c | 111
-rw-r--r--  kernel/bpf/helpers.c | 126
-rw-r--r--  kernel/bpf/inode.c | 8
-rw-r--r--  kernel/bpf/lpm_trie.c | 25
-rw-r--r--  kernel/bpf/offload.c | 11
-rw-r--r--  kernel/bpf/percpu_freelist.c | 113
-rw-r--r--  kernel/bpf/percpu_freelist.h | 4
-rw-r--r--  kernel/bpf/preload/bpf_preload_kern.c | 1
-rw-r--r--  kernel/bpf/rqspinlock.c | 737
-rw-r--r--  kernel/bpf/rqspinlock.h | 48
-rw-r--r--  kernel/bpf/syscall.c | 77
-rw-r--r--  kernel/bpf/verifier.c | 1610
-rw-r--r--  kernel/capability.c | 16
-rw-r--r--  kernel/cfi.c | 9
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 1
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 7
-rw-r--r--  kernel/cgroup/cgroup.c | 6
-rw-r--r--  kernel/cgroup/cpuset-v1.c | 49
-rw-r--r--  kernel/cgroup/cpuset.c | 79
-rw-r--r--  kernel/cgroup/legacy_freezer.c | 6
-rw-r--r--  kernel/cgroup/misc.c | 16
-rw-r--r--  kernel/cgroup/rstat.c | 116
-rw-r--r--  kernel/configs/hardening.config | 2
-rw-r--r--  kernel/context_tracking.c | 9
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/crash_core.c | 2
-rw-r--r--  kernel/dma/direct.c | 28
-rw-r--r--  kernel/entry/Makefile | 3
-rw-r--r--  kernel/entry/common.c | 2
-rw-r--r--  kernel/events/callchain.c | 38
-rw-r--r--  kernel/events/core.c | 1083
-rw-r--r--  kernel/events/hw_breakpoint.c | 5
-rw-r--r--  kernel/events/ring_buffer.c | 3
-rw-r--r--  kernel/events/uprobes.c | 12
-rw-r--r--  kernel/exit.c | 64
-rw-r--r--  kernel/fork.c | 29
-rw-r--r--  kernel/futex/core.c | 21
-rw-r--r--  kernel/iomem.c | 5
-rw-r--r--  kernel/irq/Kconfig | 5
-rw-r--r--  kernel/irq/chip.c | 77
-rw-r--r--  kernel/irq/internals.h | 11
-rw-r--r--  kernel/irq/irqdesc.c | 2
-rw-r--r--  kernel/irq/irqdomain.c | 5
-rw-r--r--  kernel/irq/manage.c | 7
-rw-r--r--  kernel/irq/migration.c | 20
-rw-r--r--  kernel/irq/msi.c | 21
-rw-r--r--  kernel/jump_label.c | 31
-rw-r--r--  kernel/kallsyms.c | 12
-rw-r--r--  kernel/kcmp.c | 2
-rw-r--r--  kernel/kexec_core.c | 4
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/livepatch/core.c | 13
-rw-r--r--  kernel/locking/Makefile | 3
-rw-r--r--  kernel/locking/lock_events_list.h | 33
-rw-r--r--  kernel/locking/lockdep.c | 20
-rw-r--r--  kernel/locking/locktorture.c | 57
-rw-r--r--  kernel/locking/mcs_spinlock.h | 10
-rw-r--r--  kernel/locking/mutex.c | 2
-rw-r--r--  kernel/locking/qspinlock.c | 193
-rw-r--r--  kernel/locking/qspinlock.h | 201
-rw-r--r--  kernel/locking/rtmutex.c | 29
-rw-r--r--  kernel/module/internal.h | 11
-rw-r--r--  kernel/module/kallsyms.c | 73
-rw-r--r--  kernel/module/main.c | 199
-rw-r--r--  kernel/module/strict_rwx.c | 9
-rw-r--r--  kernel/module/tracking.c | 2
-rw-r--r--  kernel/module/tree_lookup.c | 8
-rw-r--r--  kernel/module/version.c | 14
-rw-r--r--  kernel/padata.c | 2
-rw-r--r--  kernel/panic.c | 1
-rw-r--r--  kernel/params.c | 29
-rw-r--r--  kernel/pid.c | 106
-rw-r--r--  kernel/power/Kconfig | 3
-rw-r--r--  kernel/power/energy_model.c | 67
-rw-r--r--  kernel/power/hibernate.c | 27
-rw-r--r--  kernel/power/snapshot.c | 16
-rw-r--r--  kernel/power/suspend.c | 22
-rw-r--r--  kernel/power/swap.c | 58
-rw-r--r--  kernel/printk/internal.h | 1
-rw-r--r--  kernel/printk/printk.c | 59
-rw-r--r--  kernel/printk/printk_ringbuffer.c | 13
-rw-r--r--  kernel/rcu/Kconfig | 35
-rw-r--r--  kernel/rcu/Kconfig.debug | 18
-rw-r--r--  kernel/rcu/rcu.h | 13
-rw-r--r--  kernel/rcu/rcutorture.c | 124
-rw-r--r--  kernel/rcu/refscale.c | 32
-rw-r--r--  kernel/rcu/srcutiny.c | 20
-rw-r--r--  kernel/rcu/srcutree.c | 207
-rw-r--r--  kernel/rcu/tasks.h | 5
-rw-r--r--  kernel/rcu/tiny.c | 29
-rw-r--r--  kernel/rcu/tree.c | 73
-rw-r--r--  kernel/rcu/tree_exp.h | 6
-rw-r--r--  kernel/rcu/tree_nocb.h | 20
-rw-r--r--  kernel/rcu/tree_plugin.h | 22
-rw-r--r--  kernel/reboot.c | 1
-rw-r--r--  kernel/rseq.c | 140
-rw-r--r--  kernel/sched/Makefile | 5
-rw-r--r--  kernel/sched/build_policy.c | 1
-rw-r--r--  kernel/sched/build_utility.c | 4
-rw-r--r--  kernel/sched/core.c | 172
-rw-r--r--  kernel/sched/core_sched.c | 2
-rw-r--r--  kernel/sched/deadline.c | 57
-rw-r--r--  kernel/sched/debug.c | 18
-rw-r--r--  kernel/sched/ext.c | 1087
-rw-r--r--  kernel/sched/ext.h | 10
-rw-r--r--  kernel/sched/ext_idle.c | 1171
-rw-r--r--  kernel/sched/ext_idle.h | 35
-rw-r--r--  kernel/sched/fair.c | 139
-rw-r--r--  kernel/sched/rt.c | 14
-rw-r--r--  kernel/sched/sched.h | 128
-rw-r--r--  kernel/sched/stats.h | 2
-rw-r--r--  kernel/sched/syscalls.c | 12
-rw-r--r--  kernel/sched/topology.c | 45
-rw-r--r--  kernel/seccomp.c | 49
-rw-r--r--  kernel/signal.c | 110
-rw-r--r--  kernel/softirq.c | 18
-rw-r--r--  kernel/static_call_inline.c | 15
-rw-r--r--  kernel/stop_machine.c | 1
-rw-r--r--  kernel/sys.c | 19
-rw-r--r--  kernel/sysctl.c | 359
-rw-r--r--  kernel/time/Makefile | 6
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/hrtimer.c | 34
-rw-r--r--  kernel/time/namespace.c | 24
-rw-r--r--  kernel/time/ntp.c | 3
-rw-r--r--  kernel/time/posix-clock.c | 27
-rw-r--r--  kernel/time/posix-timers.c | 558
-rw-r--r--  kernel/time/sched_clock.c | 3
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 3
-rw-r--r--  kernel/time/tick-sched.c | 6
-rw-r--r--  kernel/time/timekeeping.c | 94
-rw-r--r--  kernel/time/timer_list.c | 4
-rw-r--r--  kernel/time/vsyscall.c | 66
-rw-r--r--  kernel/torture.c | 12
-rw-r--r--  kernel/trace/Kconfig | 12
-rw-r--r--  kernel/trace/bpf_trace.c | 59
-rw-r--r--  kernel/trace/fgraph.c | 2
-rw-r--r--  kernel/trace/ftrace.c | 57
-rw-r--r--  kernel/trace/ring_buffer.c | 249
-rw-r--r--  kernel/trace/rv/Kconfig | 7
-rw-r--r--  kernel/trace/rv/Makefile | 7
-rw-r--r--  kernel/trace/rv/monitors/sched/Kconfig | 11
-rw-r--r--  kernel/trace/rv/monitors/sched/sched.c | 38
-rw-r--r--  kernel/trace/rv/monitors/sched/sched.h | 3
-rw-r--r--  kernel/trace/rv/monitors/sco/Kconfig | 14
-rw-r--r--  kernel/trace/rv/monitors/sco/sco.c | 88
-rw-r--r--  kernel/trace/rv/monitors/sco/sco.h | 47
-rw-r--r--  kernel/trace/rv/monitors/sco/sco_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/scpd/Kconfig | 15
-rw-r--r--  kernel/trace/rv/monitors/scpd/scpd.c | 96
-rw-r--r--  kernel/trace/rv/monitors/scpd/scpd.h | 49
-rw-r--r--  kernel/trace/rv/monitors/scpd/scpd_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/sncid/Kconfig | 15
-rw-r--r--  kernel/trace/rv/monitors/sncid/sncid.c | 96
-rw-r--r--  kernel/trace/rv/monitors/sncid/sncid.h | 49
-rw-r--r--  kernel/trace/rv/monitors/sncid/sncid_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/snep/Kconfig | 15
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.c | 96
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.h | 49
-rw-r--r--  kernel/trace/rv/monitors/snep/snep_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/snroc/Kconfig | 14
-rw-r--r--  kernel/trace/rv/monitors/snroc/snroc.c | 85
-rw-r--r--  kernel/trace/rv/monitors/snroc/snroc.h | 47
-rw-r--r--  kernel/trace/rv/monitors/snroc/snroc_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/tss/Kconfig | 14
-rw-r--r--  kernel/trace/rv/monitors/tss/tss.c | 91
-rw-r--r--  kernel/trace/rv/monitors/tss/tss.h | 47
-rw-r--r--  kernel/trace/rv/monitors/tss/tss_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/wip/Kconfig | 2
-rw-r--r--  kernel/trace/rv/monitors/wip/wip.c | 2
-rw-r--r--  kernel/trace/rv/monitors/wip/wip.h | 1
-rw-r--r--  kernel/trace/rv/monitors/wwnr/Kconfig | 2
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr.c | 2
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr.h | 1
-rw-r--r--  kernel/trace/rv/rv.c | 154
-rw-r--r--  kernel/trace/rv/rv.h | 4
-rw-r--r--  kernel/trace/rv/rv_reactors.c | 28
-rw-r--r--  kernel/trace/rv/rv_trace.h | 6
-rw-r--r--  kernel/trace/trace.c | 421
-rw-r--r--  kernel/trace/trace.h | 33
-rw-r--r--  kernel/trace/trace_entries.h | 12
-rw-r--r--  kernel/trace/trace_eprobe.c | 8
-rw-r--r--  kernel/trace/trace_event_perf.c | 4
-rw-r--r--  kernel/trace/trace_events.c | 44
-rw-r--r--  kernel/trace/trace_events_hist.c | 20
-rw-r--r--  kernel/trace/trace_events_synth.c | 40
-rw-r--r--  kernel/trace/trace_events_trigger.c | 38
-rw-r--r--  kernel/trace/trace_events_user.c | 7
-rw-r--r--  kernel/trace/trace_fprobe.c | 35
-rw-r--r--  kernel/trace/trace_functions.c | 46
-rw-r--r--  kernel/trace/trace_functions_graph.c | 176
-rw-r--r--  kernel/trace/trace_irqsoff.c | 14
-rw-r--r--  kernel/trace/trace_kprobe.c | 14
-rw-r--r--  kernel/trace/trace_osnoise.c | 94
-rw-r--r--  kernel/trace/trace_output.c | 126
-rw-r--r--  kernel/trace/trace_output.h | 9
-rw-r--r--  kernel/trace/trace_probe.c | 28
-rw-r--r--  kernel/trace/trace_probe.h | 1
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 6
-rw-r--r--  kernel/trace/trace_uprobe.c | 9
-rw-r--r--  kernel/tracepoint.c | 2
-rw-r--r--  kernel/user_namespace.c | 26
-rw-r--r--  kernel/watch_queue.c | 16
-rw-r--r--  kernel/watchdog.c | 28
-rw-r--r--  kernel/watchdog_perf.c | 29
220 files changed, 9971 insertions, 4727 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 87866b037fbe..434929de17ef 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,11 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_context_tracking.o += -DDISABLE_BRANCH_PROFILING
+endif
+
# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
# in coverage traces.
KCOV_INSTRUMENT_softirq.o := n
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 7f358740e958..367eaf2c78b7 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -350,11 +350,10 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- if (d_is_positive(d)) {
- /* update watch filter fields */
- watch->dev = d->d_sb->s_dev;
- watch->ino = d_backing_inode(d)->i_ino;
- }
+ /* update watch filter fields */
+ watch->dev = d->d_sb->s_dev;
+ watch->ino = d_backing_inode(d)->i_ino;
+
inode_unlock(d_backing_inode(parent->dentry));
dput(d);
return 0;
@@ -419,10 +418,11 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
/* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
- if (ret) {
+ if (ret && ret != -ENOENT) {
audit_put_watch(watch);
return ret;
}
+ ret = 0;
/* either find an old parent or attach a new one */
parent = audit_find_parent(d_backing_inode(parent_path.dentry));
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9c853cde9abe..78fd876a5473 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2207,10 +2207,8 @@ __audit_reusename(const __user char *uptr)
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr) {
- atomic_inc(&n->name->refcnt);
- return n->name;
- }
+ if (n->name->uptr == uptr)
+ return refname(n->name);
}
return NULL;
}
@@ -2237,7 +2235,7 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- atomic_inc(&name->refcnt);
+ refname(name);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2369,7 +2367,7 @@ out_alloc:
return;
if (name) {
n->name = name;
- atomic_inc(&name->refcnt);
+ refname(name);
}
out:
@@ -2496,7 +2494,7 @@ void __audit_inode_child(struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- atomic_inc(&found_child->name->refcnt);
+ refname(found_child->name);
}
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 410028633621..70502f038b92 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o
ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
endif
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 095a9554e1de..0d56cea71602 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_SIGSEGV;
/* Account into memcg of the process that created bpf_arena */
- ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+ ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
return VM_FAULT_SIGSEGV;
@@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
if (ret)
goto out_free_pages;
- ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
- node_id, page_cnt, pages);
+ ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
if (ret)
goto out;
@@ -577,8 +576,8 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(arena_kfuncs)
-BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
-BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2)
+BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2)
BTF_KFUNCS_END(arena_kfuncs)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 54ff2a85d4c0..148da8f7ff36 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -161,6 +161,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
+ bool nobusy;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
@@ -169,21 +170,21 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!cgroup)
return (unsigned long)NULL;
- if (!bpf_cgrp_storage_trylock())
- return (unsigned long)NULL;
+ nobusy = bpf_cgrp_storage_trylock();
- sdata = cgroup_storage_lookup(cgroup, map, true);
+ sdata = cgroup_storage_lookup(cgroup, map, nobusy);
if (sdata)
goto unlock;
/* only allocate new storage, when the cgroup is refcounted */
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy)
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
value, BPF_NOEXIST, false, gfp_flags);
unlock:
- bpf_cgrp_storage_unlock();
+ if (nobusy)
+ bpf_cgrp_storage_unlock();
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 106735145948..380e9a7cac75 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -335,7 +335,7 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo,
tinfo->btf_id = prog->aux->attach_btf_id;
}
-bool bpf_iter_prog_supported(struct bpf_prog *prog)
+int bpf_iter_prog_supported(struct bpf_prog *prog)
{
const char *attach_fname = prog->aux->attach_func_name;
struct bpf_iter_target_info *tinfo = NULL, *iter;
@@ -344,7 +344,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
int prefix_len = strlen(prefix);
if (strncmp(attach_fname, prefix, prefix_len))
- return false;
+ return -EINVAL;
mutex_lock(&targets_mutex);
list_for_each_entry(iter, &targets, list) {
@@ -360,12 +360,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
}
mutex_unlock(&targets_mutex);
- if (tinfo) {
- prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
- prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
- }
+ if (!tinfo)
+ return -EINVAL;
- return tinfo != NULL;
+ return bpf_prog_ctx_arg_info_init(prog, tinfo->reg_info->ctx_arg_info,
+ tinfo->reg_info->ctx_arg_info_size);
}
const struct bpf_func_proto *
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 967492b65185..0a59df1c550a 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -316,7 +316,9 @@ BTF_ID(func, bpf_lsm_inode_getxattr)
BTF_ID(func, bpf_lsm_inode_mknod)
BTF_ID(func, bpf_lsm_inode_need_killpriv)
BTF_ID(func, bpf_lsm_inode_post_setxattr)
+BTF_ID(func, bpf_lsm_inode_post_removexattr)
BTF_ID(func, bpf_lsm_inode_readlink)
+BTF_ID(func, bpf_lsm_inode_removexattr)
BTF_ID(func, bpf_lsm_inode_rename)
BTF_ID(func, bpf_lsm_inode_rmdir)
BTF_ID(func, bpf_lsm_inode_setattr)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 040fb1cd840b..db13ee70d94d 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -146,39 +146,7 @@ void bpf_struct_ops_image_free(void *image)
}
#define MAYBE_NULL_SUFFIX "__nullable"
-#define MAX_STUB_NAME 128
-
-/* Return the type info of a stub function, if it exists.
- *
- * The name of a stub function is made up of the name of the struct_ops and
- * the name of the function pointer member, separated by "__". For example,
- * if the struct_ops type is named "foo_ops" and the function pointer
- * member is named "bar", the stub function name would be "foo_ops__bar".
- */
-static const struct btf_type *
-find_stub_func_proto(const struct btf *btf, const char *st_op_name,
- const char *member_name)
-{
- char stub_func_name[MAX_STUB_NAME];
- const struct btf_type *func_type;
- s32 btf_id;
- int cp;
-
- cp = snprintf(stub_func_name, MAX_STUB_NAME, "%s__%s",
- st_op_name, member_name);
- if (cp >= MAX_STUB_NAME) {
- pr_warn("Stub function name too long\n");
- return NULL;
- }
- btf_id = btf_find_by_name_kind(btf, stub_func_name, BTF_KIND_FUNC);
- if (btf_id < 0)
- return NULL;
- func_type = btf_type_by_id(btf, btf_id);
- if (!func_type)
- return NULL;
-
- return btf_type_by_id(btf, func_type->type); /* FUNC_PROTO */
-}
+#define REFCOUNTED_SUFFIX "__ref"
/* Prepare argument info for every nullable argument of a member of a
* struct_ops type.
@@ -203,27 +171,44 @@ find_stub_func_proto(const struct btf *btf, const char *st_op_name,
static int prepare_arg_info(struct btf *btf,
const char *st_ops_name,
const char *member_name,
- const struct btf_type *func_proto,
+ const struct btf_type *func_proto, void *stub_func_addr,
struct bpf_struct_ops_arg_info *arg_info)
{
const struct btf_type *stub_func_proto, *pointed_type;
+ bool is_nullable = false, is_refcounted = false;
const struct btf_param *stub_args, *args;
struct bpf_ctx_arg_aux *info, *info_buf;
u32 nargs, arg_no, info_cnt = 0;
+ char ksym[KSYM_SYMBOL_LEN];
+ const char *stub_fname;
+ const char *suffix;
+ s32 stub_func_id;
u32 arg_btf_id;
int offset;
- stub_func_proto = find_stub_func_proto(btf, st_ops_name, member_name);
- if (!stub_func_proto)
- return 0;
+ stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym);
+ if (!stub_fname) {
+ pr_warn("Cannot find the stub function name for the %s in struct %s\n",
+ member_name, st_ops_name);
+ return -ENOENT;
+ }
+
+ stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC);
+ if (stub_func_id < 0) {
+ pr_warn("Cannot find the stub function %s in btf\n", stub_fname);
+ return -ENOENT;
+ }
+
+ stub_func_proto = btf_type_by_id(btf, stub_func_id);
+ stub_func_proto = btf_type_by_id(btf, stub_func_proto->type);
/* Check if the number of arguments of the stub function is the same
* as the number of arguments of the function pointer.
*/
nargs = btf_type_vlen(func_proto);
if (nargs != btf_type_vlen(stub_func_proto)) {
- pr_warn("the number of arguments of the stub function %s__%s does not match the number of arguments of the member %s of struct %s\n",
- st_ops_name, member_name, member_name, st_ops_name);
+ pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n",
+ stub_fname, member_name, st_ops_name);
return -EINVAL;
}
@@ -241,10 +226,18 @@ static int prepare_arg_info(struct btf *btf,
info = info_buf;
for (arg_no = 0; arg_no < nargs; arg_no++) {
/* Skip arguments that is not suffixed with
- * "__nullable".
+ * "__nullable or __ref".
*/
- if (!btf_param_match_suffix(btf, &stub_args[arg_no],
- MAYBE_NULL_SUFFIX))
+ is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no],
+ MAYBE_NULL_SUFFIX);
+ is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no],
+ REFCOUNTED_SUFFIX);
+
+ if (is_nullable)
+ suffix = MAYBE_NULL_SUFFIX;
+ else if (is_refcounted)
+ suffix = REFCOUNTED_SUFFIX;
+ else
continue;
/* Should be a pointer to struct */
@@ -253,30 +246,34 @@ static int prepare_arg_info(struct btf *btf,
&arg_btf_id);
if (!pointed_type ||
!btf_type_is_struct(pointed_type)) {
- pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
- st_ops_name, member_name, MAYBE_NULL_SUFFIX);
+ pr_warn("stub function %s has %s tagging to an unsupported type\n",
+ stub_fname, suffix);
goto err_out;
}
offset = btf_ctx_arg_offset(btf, func_proto, arg_no);
if (offset < 0) {
- pr_warn("stub function %s__%s has an invalid trampoline ctx offset for arg#%u\n",
- st_ops_name, member_name, arg_no);
+ pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n",
+ stub_fname, arg_no);
goto err_out;
}
if (args[arg_no].type != stub_args[arg_no].type) {
- pr_warn("arg#%u type in stub function %s__%s does not match with its original func_proto\n",
- arg_no, st_ops_name, member_name);
+ pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n",
+ arg_no, stub_fname);
goto err_out;
}
/* Fill the information of the new argument */
- info->reg_type =
- PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
info->btf_id = arg_btf_id;
info->btf = btf;
info->offset = offset;
+ if (is_nullable) {
+ info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
+ } else if (is_refcounted) {
+ info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID;
+ info->refcounted = true;
+ }
info++;
info_cnt++;
@@ -324,6 +321,13 @@ static bool is_module_member(const struct btf *btf, u32 id)
return !strcmp(btf_name_by_offset(btf, t->name_off), "module");
}
+int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
+{
+ void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);
+
+ return func_ptr ? 0 : -ENOTSUPP;
+}
+
int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
struct btf *btf,
struct bpf_verifier_log *log)
@@ -386,8 +390,11 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
st_ops_desc->value_type = btf_type_by_id(btf, value_id);
for_each_member(i, t, member) {
- const struct btf_type *func_proto;
+ const struct btf_type *func_proto, *ret_type;
+ void **stub_func_addr;
+ u32 moff;
+ moff = __btf_member_bit_offset(t, member) / 8;
mname = btf_name_by_offset(btf, member->name_off);
if (!*mname) {
pr_warn("anon member in struct %s is not supported\n",
@@ -413,9 +420,23 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
func_proto = btf_type_resolve_func_ptr(btf,
member->type,
NULL);
- if (!func_proto)
+
+ /* The member is not a function pointer or
+ * the function pointer is not supported.
+ */
+ if (!func_proto || bpf_struct_ops_supported(st_ops, moff))
continue;
+ if (func_proto->type) {
+ ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL);
+ if (ret_type && !__btf_type_is_struct(ret_type)) {
+ pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n",
+ mname, st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+ }
+
if (btf_distill_func_proto(log, btf,
func_proto, mname,
&st_ops->func_models[i])) {
@@ -425,8 +446,9 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
goto errout;
}
+ stub_func_addr = *(void **)(st_ops->cfi_stubs + moff);
err = prepare_arg_info(btf, st_ops->name, mname,
- func_proto,
+ func_proto, stub_func_addr,
arg_info + i);
if (err)
goto errout;
@@ -1152,13 +1174,6 @@ void bpf_struct_ops_put(const void *kdata)
bpf_map_put(&st_map->map);
}
-int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff)
-{
- void *func_ptr = *(void **)(st_ops->cfi_stubs + moff);
-
- return func_ptr ? 0 : -ENOTSUPP;
-}
-
static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c3223e0db2f5..16ba36f34dfa 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -606,6 +606,7 @@ s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
spin_unlock_bh(&btf_idr_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(bpf_find_btf_id);
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
@@ -2575,7 +2576,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
+ if (btf_type_kflag(t) && !btf_type_is_type_tag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
@@ -3332,6 +3333,8 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
u32 off, int sz, struct btf_field_info *info, u32 field_mask)
{
enum btf_field_type type;
+ const char *tag_value;
+ bool is_type_tag;
u32 res_id;
/* Permit modifiers on the pointer itself */
@@ -3341,19 +3344,20 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
if (!btf_type_is_ptr(t))
return BTF_FIELD_IGNORE;
t = btf_type_by_id(btf, t->type);
-
- if (!btf_type_is_type_tag(t))
+ is_type_tag = btf_type_is_type_tag(t) && !btf_type_kflag(t);
+ if (!is_type_tag)
return BTF_FIELD_IGNORE;
/* Reject extra tags */
if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
return -EINVAL;
- if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off)))
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (!strcmp("kptr_untrusted", tag_value))
type = BPF_KPTR_UNREF;
- else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+ else if (!strcmp("kptr", tag_value))
type = BPF_KPTR_REF;
- else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
+ else if (!strcmp("percpu_kptr", tag_value))
type = BPF_KPTR_PERCPU;
- else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off)))
+ else if (!strcmp("uptr", tag_value))
type = BPF_UPTR;
else
return -EINVAL;
@@ -3477,6 +3481,15 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
goto end;
}
}
+ if (field_mask & BPF_RES_SPIN_LOCK) {
+ if (!strcmp(name, "bpf_res_spin_lock")) {
+ if (*seen_mask & BPF_RES_SPIN_LOCK)
+ return -E2BIG;
+ *seen_mask |= BPF_RES_SPIN_LOCK;
+ type = BPF_RES_SPIN_LOCK;
+ goto end;
+ }
+ }
if (field_mask & BPF_TIMER) {
if (!strcmp(name, "bpf_timer")) {
if (*seen_mask & BPF_TIMER)
@@ -3655,6 +3668,7 @@ static int btf_find_field_one(const struct btf *btf,
switch (field_type) {
case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
case BPF_TIMER:
case BPF_WORKQUEUE:
case BPF_LIST_NODE:
@@ -3948,6 +3962,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
return ERR_PTR(-ENOMEM);
rec->spin_lock_off = -EINVAL;
+ rec->res_spin_lock_off = -EINVAL;
rec->timer_off = -EINVAL;
rec->wq_off = -EINVAL;
rec->refcount_off = -EINVAL;
@@ -3975,6 +3990,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->spin_lock_off = rec->fields[i].offset;
break;
+ case BPF_RES_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->res_spin_lock_off = rec->fields[i].offset;
+ break;
case BPF_TIMER:
WARN_ON_ONCE(rec->timer_off >= 0);
/* Cache offset for faster lookup at runtime */
@@ -4018,9 +4038,15 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->cnt++;
}
+ if (rec->spin_lock_off >= 0 && rec->res_spin_lock_off >= 0) {
+ ret = -EINVAL;
+ goto end;
+ }
+
/* bpf_{list_head, rb_node} require bpf_spin_lock */
if ((btf_record_has_field(rec, BPF_LIST_HEAD) ||
- btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) {
+ btf_record_has_field(rec, BPF_RB_ROOT)) &&
+ (rec->spin_lock_off < 0 && rec->res_spin_lock_off < 0)) {
ret = -EINVAL;
goto end;
}
@@ -4944,11 +4970,6 @@ static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
- btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
- return -EINVAL;
- }
-
component_idx = btf_type_decl_tag(t)->component_idx;
if (component_idx < -1) {
btf_verifier_log_type(env, t, "Invalid component_idx");
@@ -5638,7 +5659,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type = &tab->types[tab->cnt];
type->btf_id = i;
- record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
BPF_KPTR, t->size);
/* The record cannot be unset, treat it as an error if so */
@@ -6507,6 +6528,8 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
/* rxrpc */
{ "rxrpc_recvdata", 0x1 },
{ "rxrpc_resend", 0x10 },
+ { "rxrpc_tq", 0x10 },
+ { "rxrpc_client", 0x1 },
/* skb */
{"kfree_skb", 0x1000},
/* sunrpc */
@@ -6529,6 +6552,103 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
{ "mr_integ_alloc", 0x2000 },
/* bpf_testmod */
{ "bpf_testmod_test_read", 0x0 },
+ /* amdgpu */
+ { "amdgpu_vm_bo_map", 0x1 },
+ { "amdgpu_vm_bo_unmap", 0x1 },
+ /* netfs */
+ { "netfs_folioq", 0x1 },
+ /* xfs from xfs_defer_pending_class */
+ { "xfs_defer_create_intent", 0x1 },
+ { "xfs_defer_cancel_list", 0x1 },
+ { "xfs_defer_pending_finish", 0x1 },
+ { "xfs_defer_pending_abort", 0x1 },
+ { "xfs_defer_relog_intent", 0x1 },
+ { "xfs_defer_isolate_paused", 0x1 },
+ { "xfs_defer_item_pause", 0x1 },
+ { "xfs_defer_item_unpause", 0x1 },
+ /* xfs from xfs_defer_pending_item_class */
+ { "xfs_defer_add_item", 0x1 },
+ { "xfs_defer_cancel_item", 0x1 },
+ { "xfs_defer_finish_item", 0x1 },
+ /* xfs from xfs_icwalk_class */
+ { "xfs_ioc_free_eofblocks", 0x10 },
+ { "xfs_blockgc_free_space", 0x10 },
+ /* xfs from xfs_btree_cur_class */
+ { "xfs_btree_updkeys", 0x100 },
+ { "xfs_btree_overlapped_query_range", 0x100 },
+ /* xfs from xfs_imap_class*/
+ { "xfs_map_blocks_found", 0x10000 },
+ { "xfs_map_blocks_alloc", 0x10000 },
+ { "xfs_iomap_alloc", 0x1000 },
+ { "xfs_iomap_found", 0x1000 },
+ /* xfs from xfs_fs_class */
+ { "xfs_inodegc_flush", 0x1 },
+ { "xfs_inodegc_push", 0x1 },
+ { "xfs_inodegc_start", 0x1 },
+ { "xfs_inodegc_stop", 0x1 },
+ { "xfs_inodegc_queue", 0x1 },
+ { "xfs_inodegc_throttle", 0x1 },
+ { "xfs_fs_sync_fs", 0x1 },
+ { "xfs_blockgc_start", 0x1 },
+ { "xfs_blockgc_stop", 0x1 },
+ { "xfs_blockgc_worker", 0x1 },
+ { "xfs_blockgc_flush_all", 0x1 },
+ /* xfs_scrub */
+ { "xchk_nlinks_live_update", 0x10 },
+ /* xfs_scrub from xchk_metapath_class */
+ { "xchk_metapath_lookup", 0x100 },
+ /* nfsd */
+ { "nfsd_dirent", 0x1 },
+ { "nfsd_file_acquire", 0x1001 },
+ { "nfsd_file_insert_err", 0x1 },
+ { "nfsd_file_cons_err", 0x1 },
+ /* nfs4 */
+ { "nfs4_setup_sequence", 0x1 },
+ { "pnfs_update_layout", 0x10000 },
+ { "nfs4_inode_callback_event", 0x200 },
+ { "nfs4_inode_stateid_callback_event", 0x200 },
+ /* nfs from pnfs_layout_event */
+ { "pnfs_mds_fallback_pg_init_read", 0x10000 },
+ { "pnfs_mds_fallback_pg_init_write", 0x10000 },
+ { "pnfs_mds_fallback_pg_get_mirror_count", 0x10000 },
+ { "pnfs_mds_fallback_read_done", 0x10000 },
+ { "pnfs_mds_fallback_write_done", 0x10000 },
+ { "pnfs_mds_fallback_read_pagelist", 0x10000 },
+ { "pnfs_mds_fallback_write_pagelist", 0x10000 },
+ /* coda */
+ { "coda_dec_pic_run", 0x10 },
+ { "coda_dec_pic_done", 0x10 },
+ /* cfg80211 */
+ { "cfg80211_scan_done", 0x11 },
+ { "rdev_set_coalesce", 0x10 },
+ { "cfg80211_report_wowlan_wakeup", 0x100 },
+ { "cfg80211_inform_bss_frame", 0x100 },
+ { "cfg80211_michael_mic_failure", 0x10000 },
+ /* cfg80211 from wiphy_work_event */
+ { "wiphy_work_queue", 0x10 },
+ { "wiphy_work_run", 0x10 },
+ { "wiphy_work_cancel", 0x10 },
+ { "wiphy_work_flush", 0x10 },
+ /* hugetlbfs */
+ { "hugetlbfs_alloc_inode", 0x10 },
+ /* spufs */
+ { "spufs_context", 0x10 },
+ /* kvm_hv */
+ { "kvm_page_fault_enter", 0x100 },
+ /* dpu */
+ { "dpu_crtc_setup_mixer", 0x100 },
+ /* binder */
+ { "binder_transaction", 0x100 },
+ /* bcachefs */
+ { "btree_path_free", 0x100 },
+ /* hfi1_tx */
+ { "hfi1_sdma_progress", 0x1000 },
+ /* iptfs */
+ { "iptfs_ingress_postq_event", 0x1000 },
+ /* neigh */
+ { "neigh_update", 0x10 },
+ /* snd_firewire_lib */
+ { "amdtp_packet", 0x100 },
};
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
@@ -6679,6 +6799,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->reg_type = ctx_arg_info->reg_type;
info->btf = ctx_arg_info->btf ? : btf_vmlinux;
info->btf_id = ctx_arg_info->btf_id;
+ info->ref_obj_id = ctx_arg_info->ref_obj_id;
return true;
}
}
@@ -6745,7 +6866,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_type_tag(t)) {
+ if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) {
tag_value = __btf_name_by_offset(btf, t->name_off);
if (strcmp(tag_value, "user") == 0)
info->reg_type |= MEM_USER;
@@ -7004,7 +7125,7 @@ error:
/* check type tag */
t = btf_type_by_id(btf, mtype->type);
- if (btf_type_is_type_tag(t)) {
+ if (btf_type_is_type_tag(t) && !btf_type_kflag(t)) {
tag_value = __btf_name_by_offset(btf, t->name_off);
/* check __user tag */
if (strcmp(tag_value, "user") == 0)
@@ -8526,6 +8647,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_SOCK_OPS:
return BTF_KFUNC_HOOK_CGROUP;
case BPF_PROG_TYPE_SCHED_ACT:
return BTF_KFUNC_HOOK_SCHED_ACT;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 46e5db65dbc8..84f58f3d028a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct hlist_head *head)
+static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
@@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head)
hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
+ if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
+ (*preorder_cnt)++;
cnt++;
}
return cnt;
@@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
if (flags & BPF_F_ALLOW_MULTI)
return true;
- cnt = prog_list_length(&p->bpf.progs[atype]);
+ cnt = prog_list_length(&p->bpf.progs[atype], NULL);
WARN_ON_ONCE(cnt > 1);
if (cnt == 1)
return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp,
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
struct cgroup *p = cgrp;
- int cnt = 0;
+ int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
/* count number of effective programs by walking parents */
do {
if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
- cnt += prog_list_length(&p->bpf.progs[atype]);
+ cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
p = cgroup_parent(p);
} while (p);
@@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
/* populate the array with effective progs */
cnt = 0;
p = cgrp;
+ fstart = preorder_cnt;
+ bstart = preorder_cnt - 1;
do {
if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
+ init_bstart = bstart;
hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
if (!prog_list_prog(pl))
continue;
- item = &progs->items[cnt];
+ if (pl->flags & BPF_F_PREORDER) {
+ item = &progs->items[bstart];
+ bstart--;
+ } else {
+ item = &progs->items[fstart];
+ fstart++;
+ }
item->prog = prog_list_prog(pl);
bpf_cgroup_storages_assign(item->cgroup_storage,
pl->storage);
cnt++;
}
+
+ /* reverse pre-ordering progs at this cgroup level */
+ for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
+ swap(progs->items[i], progs->items[j]);
+
} while ((p = cgroup_parent(p)));
*array = progs;
@@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
*/
return -EPERM;
- if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
pl = find_attach_entry(progs, prog, link, replace_prog,
@@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
pl->prog = prog;
pl->link = link;
+ pl->flags = flags;
bpf_cgroup_storages_assign(pl->storage, storage);
cgrp->bpf.flags[atype] = saved_flags;
@@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
lockdep_is_held(&cgroup_mutex));
total_cnt += bpf_prog_array_length(effective);
} else {
- total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
+ total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
}
}
@@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
u32 id;
progs = &cgrp->bpf.progs[atype];
- cnt = min_t(int, prog_list_length(progs), total_cnt);
+ cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
i = 0;
hlist_for_each_entry(pl, progs, node) {
prog = prog_list_prog(pl);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index da729cbbaeb9..ba6b6118cf50 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1663,14 +1663,17 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(JMP, JSET, K), \
INSN_2(JMP, JA), \
INSN_2(JMP32, JA), \
+ /* Atomic operations. */ \
+ INSN_3(STX, ATOMIC, B), \
+ INSN_3(STX, ATOMIC, H), \
+ INSN_3(STX, ATOMIC, W), \
+ INSN_3(STX, ATOMIC, DW), \
/* Store instructions. */ \
/* Register based. */ \
INSN_3(STX, MEM, B), \
INSN_3(STX, MEM, H), \
INSN_3(STX, MEM, W), \
INSN_3(STX, MEM, DW), \
- INSN_3(STX, ATOMIC, W), \
- INSN_3(STX, ATOMIC, DW), \
/* Immediate based. */ \
INSN_3(ST, MEM, B), \
INSN_3(ST, MEM, H), \
@@ -2152,24 +2155,33 @@ out:
if (BPF_SIZE(insn->code) == BPF_W) \
atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
(DST + insn->off)); \
- else \
+ else if (BPF_SIZE(insn->code) == BPF_DW) \
atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
(DST + insn->off)); \
+ else \
+ goto default_label; \
break; \
case BOP | BPF_FETCH: \
if (BPF_SIZE(insn->code) == BPF_W) \
SRC = (u32) atomic_fetch_##KOP( \
(u32) SRC, \
(atomic_t *)(unsigned long) (DST + insn->off)); \
- else \
+ else if (BPF_SIZE(insn->code) == BPF_DW) \
SRC = (u64) atomic64_fetch_##KOP( \
(u64) SRC, \
(atomic64_t *)(unsigned long) (DST + insn->off)); \
+ else \
+ goto default_label; \
break;
STX_ATOMIC_DW:
STX_ATOMIC_W:
+ STX_ATOMIC_H:
+ STX_ATOMIC_B:
switch (IMM) {
+ /* Atomic read-modify-write instructions support only W and DW
+ * size modifiers.
+ */
ATOMIC_ALU_OP(BPF_ADD, add)
ATOMIC_ALU_OP(BPF_AND, and)
ATOMIC_ALU_OP(BPF_OR, or)
@@ -2181,20 +2193,63 @@ out:
SRC = (u32) atomic_xchg(
(atomic_t *)(unsigned long) (DST + insn->off),
(u32) SRC);
- else
+ else if (BPF_SIZE(insn->code) == BPF_DW)
SRC = (u64) atomic64_xchg(
(atomic64_t *)(unsigned long) (DST + insn->off),
(u64) SRC);
+ else
+ goto default_label;
break;
case BPF_CMPXCHG:
if (BPF_SIZE(insn->code) == BPF_W)
BPF_R0 = (u32) atomic_cmpxchg(
(atomic_t *)(unsigned long) (DST + insn->off),
(u32) BPF_R0, (u32) SRC);
- else
+ else if (BPF_SIZE(insn->code) == BPF_DW)
BPF_R0 = (u64) atomic64_cmpxchg(
(atomic64_t *)(unsigned long) (DST + insn->off),
(u64) BPF_R0, (u64) SRC);
+ else
+ goto default_label;
+ break;
+ /* Atomic load and store instructions support all size
+ * modifiers.
+ */
+ case BPF_LOAD_ACQ:
+ switch (BPF_SIZE(insn->code)) {
+#define LOAD_ACQUIRE(SIZEOP, SIZE) \
+ case BPF_##SIZEOP: \
+ DST = (SIZE)smp_load_acquire( \
+ (SIZE *)(unsigned long)(SRC + insn->off)); \
+ break;
+ LOAD_ACQUIRE(B, u8)
+ LOAD_ACQUIRE(H, u16)
+ LOAD_ACQUIRE(W, u32)
+#ifdef CONFIG_64BIT
+ LOAD_ACQUIRE(DW, u64)
+#endif
+#undef LOAD_ACQUIRE
+ default:
+ goto default_label;
+ }
+ break;
+ case BPF_STORE_REL:
+ switch (BPF_SIZE(insn->code)) {
+#define STORE_RELEASE(SIZEOP, SIZE) \
+ case BPF_##SIZEOP: \
+ smp_store_release( \
+ (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \
+ break;
+ STORE_RELEASE(B, u8)
+ STORE_RELEASE(H, u16)
+ STORE_RELEASE(W, u32)
+#ifdef CONFIG_64BIT
+ STORE_RELEASE(DW, u64)
+#endif
+#undef STORE_RELEASE
+ default:
+ goto default_label;
+ }
break;
default:
@@ -2290,17 +2345,18 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth)
insn->code = BPF_JMP | BPF_CALL_ARGS;
}
#endif
-#else
+#endif
+
static unsigned int __bpf_prog_ret0_warn(const void *ctx,
const struct bpf_insn *insn)
{
/* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
- * is not working properly, so warn about it!
+ * is not working properly, or interpreter is being used when
+ * prog->jit_requested is not 0, so warn about it!
*/
WARN_ON_ONCE(1);
return 0;
}
-#endif
bool bpf_prog_map_compatible(struct bpf_map *map,
const struct bpf_prog *fp)
@@ -2380,8 +2436,18 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+ u32 idx = (round_up(stack_depth, 32) / 32) - 1;
- fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+ /* may_goto may cause stack size > 512, leading to idx out-of-bounds.
+ * But for non-JITed programs, we don't need bpf_func, so no bounds
+ * check needed.
+ */
+ if (!fp->jit_requested &&
+ !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) {
+ fp->bpf_func = interpreters[idx];
+ } else {
+ fp->bpf_func = __bpf_prog_ret0_warn;
+ }
#else
fp->bpf_func = __bpf_prog_ret0_warn;
#endif
@@ -2906,6 +2972,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
return NULL;
}
+const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void)
+{
+ return NULL;
+}
+
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
@@ -3058,6 +3129,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp,
{
}
+bool __weak bpf_jit_supports_timed_may_goto(void)
+{
+ return false;
+}
+
+u64 __weak arch_bpf_timed_may_goto(void)
+{
+ return 0;
+}
+
+u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p)
+{
+ u64 time = ktime_get_mono_fast_ns();
+
+ /* Populate the timestamp for this stack frame, and refresh count. */
+ if (!p->timestamp) {
+ p->timestamp = time;
+ return BPF_MAX_TIMED_LOOPS;
+ }
+ /* Check if we've exhausted our time slice, and zero count. */
+ if (time - p->timestamp >= (NSEC_PER_SEC / 4))
+ return 0;
+ /* Refresh the count for the stack frame. */
+ return BPF_MAX_TIMED_LOOPS;
+}
+
/* for configs without MMU or 32-bit */
__weak const struct bpf_map_ops arena_map_ops;
__weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 774accbd4a22..67e8a2fc1a99 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -33,8 +33,8 @@
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
-#include <linux/netdevice.h> /* netif_receive_skb_list */
-#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/netdevice.h>
+#include <net/gro.h>
/* General idea: XDP packets getting XDP redirected to another CPU,
* will maximum be stored/queued for one driver ->poll() call. It is
@@ -68,6 +68,7 @@ struct bpf_cpu_map_entry {
struct bpf_cpumap_val value;
struct bpf_prog *prog;
+ struct gro_node gro;
struct completion kthread_running;
struct rcu_work free_work;
@@ -133,22 +134,23 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
}
}
-static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
- struct list_head *listp,
- struct xdp_cpumap_stats *stats)
+static u32 cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ void **skbs, u32 skb_n,
+ struct xdp_cpumap_stats *stats)
{
- struct sk_buff *skb, *tmp;
struct xdp_buff xdp;
- u32 act;
+ u32 act, pass = 0;
int err;
- list_for_each_entry_safe(skb, tmp, listp, list) {
+ for (u32 i = 0; i < skb_n; i++) {
+ struct sk_buff *skb = skbs[i];
+
act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
switch (act) {
case XDP_PASS:
+ skbs[pass++] = skb;
break;
case XDP_REDIRECT:
- skb_list_del_init(skb);
err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
rcpu->prog);
if (unlikely(err)) {
@@ -157,7 +159,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
} else {
stats->redirect++;
}
- return;
+ break;
default:
bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
@@ -165,12 +167,15 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
trace_xdp_exception(skb->dev, rcpu->prog, act);
fallthrough;
case XDP_DROP:
- skb_list_del_init(skb);
- kfree_skb(skb);
+ napi_consume_skb(skb, true);
stats->drop++;
- return;
+ break;
}
}
+
+ stats->pass += pass;
+
+ return pass;
}
static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
@@ -204,7 +209,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
stats->drop++;
} else {
frames[nframes++] = xdpf;
- stats->pass++;
}
break;
case XDP_REDIRECT:
@@ -228,43 +232,65 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
xdp_clear_return_frame_no_direct();
+ stats->pass += nframes;
return nframes;
}
#define CPUMAP_BATCH 8
-static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
- int xdp_n, struct xdp_cpumap_stats *stats,
- struct list_head *list)
+struct cpu_map_ret {
+ u32 xdp_n;
+ u32 skb_n;
+};
+
+static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ void **skbs, struct cpu_map_ret *ret,
+ struct xdp_cpumap_stats *stats)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
- int nframes;
if (!rcpu->prog)
- return xdp_n;
+ goto out;
- rcu_read_lock_bh();
+ rcu_read_lock();
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
- nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+ ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats);
+ if (unlikely(ret->skb_n))
+ ret->skb_n = cpu_map_bpf_prog_run_skb(rcpu, skbs, ret->skb_n,
+ stats);
if (stats->redirect)
xdp_do_flush();
- if (unlikely(!list_empty(list)))
- cpu_map_bpf_prog_run_skb(rcpu, list, stats);
-
bpf_net_ctx_clear(bpf_net_ctx);
- rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
+ rcu_read_unlock();
- return nframes;
+out:
+ if (unlikely(ret->skb_n) && ret->xdp_n)
+ memmove(&skbs[ret->xdp_n], skbs, ret->skb_n * sizeof(*skbs));
+}
+
+static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty)
+{
+ /*
+ * If the ring is not empty, there'll be a new iteration soon, and we
+ * only need to do a full flush if a tick is long (> 1 ms).
+ * If the ring is empty, to not hold GRO packets in the stack for too
+ * long, do a full flush.
+ * This is equivalent to how NAPI decides whether to perform a full
+ * flush.
+ */
+ gro_flush(&rcpu->gro, !empty && HZ >= 1000);
+ gro_normal_list(&rcpu->gro);
}
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
unsigned long last_qs = jiffies;
+ u32 packets = 0;
complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
@@ -277,11 +303,11 @@ static int cpu_map_kthread_run(void *data)
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
struct xdp_cpumap_stats stats = {}; /* zero stats */
unsigned int kmem_alloc_drops = 0, sched = 0;
- gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
- int i, n, m, nframes, xdp_n;
+ struct cpu_map_ret ret = { };
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- LIST_HEAD(list);
+ u32 i, n, m;
+ bool empty;
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -306,7 +332,7 @@ static int cpu_map_kthread_run(void *data)
*/
n = __ptr_ring_consume_batched(rcpu->queue, frames,
CPUMAP_BATCH);
- for (i = 0, xdp_n = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
void *f = frames[i];
struct page *page;
@@ -314,11 +340,11 @@ static int cpu_map_kthread_run(void *data)
struct sk_buff *skb = f;
__ptr_clear_bit(0, &skb);
- list_add_tail(&skb->list, &list);
+ skbs[ret.skb_n++] = skb;
continue;
}
- frames[xdp_n++] = f;
+ frames[ret.xdp_n++] = f;
page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
@@ -328,40 +354,51 @@ static int cpu_map_kthread_run(void *data)
prefetchw(page);
}
+ local_bh_disable();
+
/* Support running another XDP prog on this CPU */
- nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
- if (nframes) {
- m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
- gfp, nframes, skbs);
- if (unlikely(m == 0)) {
- for (i = 0; i < nframes; i++)
- skbs[i] = NULL; /* effect: xdp_return_frame */
- kmem_alloc_drops += nframes;
- }
+ cpu_map_bpf_prog_run(rcpu, frames, skbs, &ret, &stats);
+ if (!ret.xdp_n)
+ goto stats;
+
+ m = napi_skb_cache_get_bulk(skbs, ret.xdp_n);
+ if (unlikely(m < ret.xdp_n)) {
+ for (i = m; i < ret.xdp_n; i++)
+ xdp_return_frame(frames[i]);
+
+ if (ret.skb_n)
+ memmove(&skbs[m], &skbs[ret.xdp_n],
+ ret.skb_n * sizeof(*skbs));
+
+ kmem_alloc_drops += ret.xdp_n - m;
+ ret.xdp_n = m;
}
- local_bh_disable();
- for (i = 0; i < nframes; i++) {
+ for (i = 0; i < ret.xdp_n; i++) {
struct xdp_frame *xdpf = frames[i];
- struct sk_buff *skb = skbs[i];
- skb = __xdp_build_skb_from_frame(xdpf, skb,
- xdpf->dev_rx);
- if (!skb) {
- xdp_return_frame(xdpf);
- continue;
- }
-
- list_add_tail(&skb->list, &list);
+ /* Can fail only when !skb -- already handled above */
+ __xdp_build_skb_from_frame(xdpf, skbs[i], xdpf->dev_rx);
}
+stats:
/* Feedback loop via tracepoint.
* NB: keep before recv to allow measuring enqueue/dequeue latency.
*/
trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
sched, &stats);
- netif_receive_skb_list(&list);
+ for (i = 0; i < ret.xdp_n + ret.skb_n; i++)
+ gro_receive_skb(&rcpu->gro, skbs[i]);
+
+ /* Flush either every 64 packets or in case of empty ring */
+ packets += n;
+ empty = __ptr_ring_empty(rcpu->queue);
+ if (packets >= NAPI_POLL_WEIGHT || empty) {
+ cpu_map_gro_flush(rcpu, empty);
+ packets = 0;
+ }
+
local_bh_enable(); /* resched point, may call do_softirq() */
}
__set_current_state(TASK_RUNNING);
@@ -430,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->cpu = cpu;
rcpu->map_id = map->id;
rcpu->value.qsize = value->qsize;
+ gro_init(&rcpu->gro);
if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
goto free_ptr_ring;
@@ -458,6 +496,7 @@ free_prog:
if (rcpu->prog)
bpf_prog_put(rcpu->prog);
free_ptr_ring:
+ gro_cleanup(&rcpu->gro);
ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
kfree(rcpu->queue);
@@ -487,6 +526,7 @@ static void __cpu_map_entry_free(struct work_struct *work)
if (rcpu->prog)
bpf_prog_put(rcpu->prog);
+ gro_cleanup(&rcpu->gro);
/* The queue should be empty at this point */
__cpu_map_ring_cleanup(rcpu->queue);
ptr_ring_cleanup(rcpu->queue, NULL);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index cfa1c18e3a48..9876c5fe6c2a 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -45,6 +45,10 @@ __bpf_kfunc_start_defs();
*
* bpf_cpumask_create() allocates memory using the BPF memory allocator, and
* will not block. It may return NULL if no memory is available.
+ *
+ * Return:
+ * * A pointer to a new struct bpf_cpumask instance on success.
+ * * NULL if the BPF memory allocator is out of memory.
*/
__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
{
@@ -71,6 +75,10 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
* Acquires a reference to a BPF cpumask. The cpumask returned by this function
* must either be embedded in a map as a kptr, or freed with
* bpf_cpumask_release().
+ *
+ * Return:
+ * * The struct bpf_cpumask pointer passed to the function.
+ *
*/
__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
{
@@ -106,6 +114,9 @@ CFI_NOSEAL(bpf_cpumask_release_dtor);
*
* Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask
* pointer may be safely passed to this function.
+ *
+ * Return:
+ * * The index of the first nonzero bit in the struct cpumask.
*/
__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
{
@@ -119,6 +130,9 @@ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask)
*
* Find the index of the first unset bit of the cpumask. A struct bpf_cpumask
* pointer may be safely passed to this function.
+ *
+ * Return:
+ * * The index of the first zero bit in the struct cpumask.
*/
__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
{
@@ -133,6 +147,9 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask)
*
* Find the index of the first nonzero bit of the AND of two cpumasks.
* struct bpf_cpumask pointers may be safely passed to @src1 and @src2.
+ *
+ * Return:
+ * * The index of the first bit that is nonzero in both cpumask instances.
*/
__bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1,
const struct cpumask *src2)
@@ -414,12 +431,47 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
* @cpumask: The cpumask being queried.
*
* Count the number of set bits in the given cpumask.
+ *
+ * Return:
+ * * The number of bits set in the mask.
*/
__bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask)
{
return cpumask_weight(cpumask);
}
+/**
+ * bpf_cpumask_populate() - Populate the CPU mask from the contents of
+ * a BPF memory region.
+ *
+ * @cpumask: The cpumask being populated.
+ * @src: The BPF memory holding the bit pattern.
+ * @src__sz: Length of the BPF memory region in bytes.
+ *
+ * Return:
+ * * 0 if the struct cpumask * instance was populated successfully.
+ * * -EACCES if the memory region is too small to populate the cpumask.
+ * * -EINVAL if the memory region is not aligned to the size of a long
+ * and the architecture does not support efficient unaligned accesses.
+ */
+__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz)
+{
+ unsigned long source = (unsigned long)src;
+
+ /* The memory region must be large enough to populate the entire CPU mask. */
+ if (src__sz < bitmap_size(nr_cpu_ids))
+ return -EACCES;
+
+ /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ !IS_ALIGNED(source, sizeof(long)))
+ return -EINVAL;
+
+ bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids);
+
+ return 0;
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(cpumask_kfunc_btf_ids)
@@ -448,6 +500,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU)
BTF_KFUNCS_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index 309c4aa1b026..20883c6b1546 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -202,7 +202,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->dst_reg, class == BPF_ALU ? 'w' : 'r',
insn->dst_reg);
} else if (is_addr_space_cast(insn)) {
- verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
+ verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %u, %u)\n",
insn->code, insn->dst_reg,
insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
} else if (is_mov_percpu_addr(insn)) {
@@ -267,6 +267,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg, insn->off, insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ) {
+ verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_STORE_REL) {
+ verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
} else {
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
}
@@ -369,7 +381,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
insn->code, class == BPF_JMP32 ? 'w' : 'r',
insn->dst_reg,
bpf_jmp_string[BPF_OP(insn->code) >> 4],
- insn->imm, insn->off);
+ (u32)insn->imm, insn->off);
}
} else {
verbose(cbs->private_data, "(%02x) %s\n",
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4a9eeb7aef85..5a5adc66b8e2 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -16,6 +16,7 @@
#include "bpf_lru_list.h"
#include "map_in_map.h"
#include <linux/bpf_mem_alloc.h>
+#include <asm/rqspinlock.h>
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
@@ -78,7 +79,7 @@
*/
struct bucket {
struct hlist_nulls_head head;
- raw_spinlock_t raw_lock;
+ rqspinlock_t raw_lock;
};
#define HASHTAB_MAP_LOCK_COUNT 8
@@ -104,8 +105,6 @@ struct bpf_htab {
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
- struct lock_class_key lockdep_key;
- int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};
/* each htab element is struct htab_elem + key + value */
@@ -140,45 +139,26 @@ static void htab_init_buckets(struct bpf_htab *htab)
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- raw_spin_lock_init(&htab->buckets[i].raw_lock);
- lockdep_set_class(&htab->buckets[i].raw_lock,
- &htab->lockdep_key);
+ raw_res_spin_lock_init(&htab->buckets[i].raw_lock);
cond_resched();
}
}
-static inline int htab_lock_bucket(const struct bpf_htab *htab,
- struct bucket *b, u32 hash,
- unsigned long *pflags)
+static inline int htab_lock_bucket(struct bucket *b, unsigned long *pflags)
{
unsigned long flags;
+ int ret;
- hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
-
- preempt_disable();
- local_irq_save(flags);
- if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
- __this_cpu_dec(*(htab->map_locked[hash]));
- local_irq_restore(flags);
- preempt_enable();
- return -EBUSY;
- }
-
- raw_spin_lock(&b->raw_lock);
+ ret = raw_res_spin_lock_irqsave(&b->raw_lock, flags);
+ if (ret)
+ return ret;
*pflags = flags;
-
return 0;
}
-static inline void htab_unlock_bucket(const struct bpf_htab *htab,
- struct bucket *b, u32 hash,
- unsigned long flags)
+static inline void htab_unlock_bucket(struct bucket *b, unsigned long flags)
{
- hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);
- raw_spin_unlock(&b->raw_lock);
- __this_cpu_dec(*(htab->map_locked[hash]));
- local_irq_restore(flags);
- preempt_enable();
+ raw_res_spin_unlock_irqrestore(&b->raw_lock, flags);
}
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
@@ -198,12 +178,12 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
- *(void __percpu **)(l->key + key_size) = pptr;
+ *(void __percpu **)(l->key + roundup(key_size, 8)) = pptr;
}
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
- return *(void __percpu **)(l->key + key_size);
+ return *(void __percpu **)(l->key + roundup(key_size, 8));
}
static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
@@ -483,14 +463,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
- int err, i;
+ int err;
htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
- lockdep_register_key(&htab->lockdep_key);
-
bpf_map_init_from_attr(&htab->map, attr);
if (percpu_lru) {
@@ -536,15 +514,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (!htab->buckets)
goto free_elem_count;
- for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
- htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
- sizeof(int),
- sizeof(int),
- GFP_USER);
- if (!htab->map_locked[i])
- goto free_map_locked;
- }
-
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
@@ -607,15 +576,12 @@ free_prealloc:
free_map_locked:
if (htab->use_percpu_counter)
percpu_counter_destroy(&htab->pcount);
- for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
- free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
bpf_mem_alloc_destroy(&htab->pcpu_ma);
bpf_mem_alloc_destroy(&htab->ma);
free_elem_count:
bpf_map_free_elem_count(&htab->map);
free_htab:
- lockdep_unregister_key(&htab->lockdep_key);
bpf_map_area_free(htab);
return ERR_PTR(err);
}
@@ -787,6 +753,9 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
static void check_and_free_fields(struct bpf_htab *htab,
struct htab_elem *elem)
{
+ if (IS_ERR_OR_NULL(htab->map.record))
+ return;
+
if (htab_is_percpu(htab)) {
void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
int cpu;
@@ -817,7 +786,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
b = __select_bucket(htab, tgt_l->hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return false;
@@ -828,7 +797,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
break;
}
- htab_unlock_bucket(htab, b, tgt_l->hash, flags);
+ htab_unlock_bucket(b, flags);
if (l == tgt_l)
check_and_free_fields(htab, l);
@@ -1147,7 +1116,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
*/
}
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1198,7 +1167,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
check_and_free_fields(htab, l_old);
}
}
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
if (l_old) {
if (old_map_ptr)
map->ops->map_fd_put_ptr(map, old_map_ptr, true);
@@ -1207,7 +1176,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
}
return 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
return ret;
}
@@ -1254,7 +1223,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
copy_map_value(&htab->map,
l_new->key + round_up(map->key_size, 8), value);
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
goto err_lock_bucket;
@@ -1275,7 +1244,7 @@ static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value
ret = 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
err_lock_bucket:
if (ret)
@@ -1312,7 +1281,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1337,7 +1306,7 @@ static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
}
ret = 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
return ret;
}
@@ -1378,7 +1347,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
return -ENOMEM;
}
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
goto err_lock_bucket;
@@ -1402,7 +1371,7 @@ static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
}
ret = 0;
err:
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
err_lock_bucket:
if (l_new) {
bpf_map_dec_elem_count(&htab->map);
@@ -1444,7 +1413,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1454,7 +1423,7 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
else
ret = -ENOENT;
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
if (l)
free_htab_elem(htab, l);
@@ -1480,7 +1449,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret)
return ret;
@@ -1491,7 +1460,7 @@ static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
else
ret = -ENOENT;
- htab_unlock_bucket(htab, b, hash, flags);
+ htab_unlock_bucket(b, flags);
if (l)
htab_lru_push_free(htab, l);
return ret;
@@ -1558,7 +1527,6 @@ static void htab_map_free_timers_and_wq(struct bpf_map *map)
static void htab_map_free(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- int i;
/* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
* bpf_free_used_maps() is called after bpf prog is no longer executing.
@@ -1583,9 +1551,6 @@ static void htab_map_free(struct bpf_map *map)
bpf_mem_alloc_destroy(&htab->ma);
if (htab->use_percpu_counter)
percpu_counter_destroy(&htab->pcount);
- for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
- free_percpu(htab->map_locked[i]);
- lockdep_unregister_key(&htab->lockdep_key);
bpf_map_area_free(htab);
}
@@ -1628,7 +1593,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
b = __select_bucket(htab, hash);
head = &b->head;
- ret = htab_lock_bucket(htab, b, hash, &bflags);
+ ret = htab_lock_bucket(b, &bflags);
if (ret)
return ret;
@@ -1665,7 +1630,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
hlist_nulls_del_rcu(&l->hash_node);
out_unlock:
- htab_unlock_bucket(htab, b, hash, bflags);
+ htab_unlock_bucket(b, bflags);
if (l) {
if (is_lru_map)
@@ -1787,7 +1752,7 @@ again_nocopy:
head = &b->head;
/* do not grab the lock unless need it (bucket_cnt > 0). */
if (locked) {
- ret = htab_lock_bucket(htab, b, batch, &flags);
+ ret = htab_lock_bucket(b, &flags);
if (ret) {
rcu_read_unlock();
bpf_enable_instrumentation();
@@ -1810,7 +1775,7 @@ again_nocopy:
/* Note that since bucket_cnt > 0 here, it is implicit
 * that the lock was grabbed, so release it.
*/
- htab_unlock_bucket(htab, b, batch, flags);
+ htab_unlock_bucket(b, flags);
rcu_read_unlock();
bpf_enable_instrumentation();
goto after_loop;
@@ -1821,7 +1786,7 @@ again_nocopy:
/* Note that since bucket_cnt > 0 here, it is implicit
 * that the lock was grabbed, so release it.
*/
- htab_unlock_bucket(htab, b, batch, flags);
+ htab_unlock_bucket(b, flags);
rcu_read_unlock();
bpf_enable_instrumentation();
kvfree(keys);
@@ -1884,7 +1849,7 @@ again_nocopy:
dst_val += value_size;
}
- htab_unlock_bucket(htab, b, batch, flags);
+ htab_unlock_bucket(b, flags);
locked = false;
while (node_to_free) {
@@ -2354,7 +2319,7 @@ static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
- offsetof(struct htab_elem, key) + map->key_size);
+ offsetof(struct htab_elem, key) + roundup(map->key_size, 8));
*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f27ce162427a..e3a2662f4e33 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1284,8 +1284,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
atomic_set(&t->cancelling, 0);
INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
- hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
- t->timer.function = bpf_timer_cb;
+ hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
cb->value = (void *)async - map->record->timer_off;
break;
case BPF_ASYNC_TYPE_WQ:
@@ -1759,8 +1758,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE,
};
-BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
- u32, offset, u64, flags)
+static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src,
+ u32 offset, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1793,6 +1792,12 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
}
}
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
+ u32, offset, u64, flags)
+{
+ return __bpf_dynptr_read(dst, len, src, offset, flags);
+}
+
static const struct bpf_func_proto bpf_dynptr_read_proto = {
.func = bpf_dynptr_read,
.gpl_only = false,
@@ -1804,8 +1809,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
- u32, len, u64, flags)
+static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src,
+ u32 len, u64 flags)
{
enum bpf_dynptr_type type;
int err;
@@ -1843,6 +1848,12 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
}
}
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+ u32, len, u64, flags)
+{
+ return __bpf_dynptr_write(dst, offset, src, len, flags);
+}
+
static const struct bpf_func_proto bpf_dynptr_write_proto = {
.func = bpf_dynptr_write,
.gpl_only = false,
@@ -2044,6 +2055,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_task_pt_regs_proto;
case BPF_FUNC_trace_vprintk:
return bpf_get_trace_vprintk_proto();
+ case BPF_FUNC_perf_event_read_value:
+ return bpf_get_perf_event_read_value_proto();
default:
return NULL;
}
@@ -2758,6 +2771,61 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
return 0;
}
+/**
+ * bpf_dynptr_copy() - Copy data from one dynptr to another.
+ * @dst_ptr: Destination dynptr - where data should be copied to
+ * @dst_off: Offset into the destination dynptr
+ * @src_ptr: Source dynptr - where data should be copied from
+ * @src_off: Offset into the source dynptr
+ * @size: Length of the data to copy from source to destination
+ *
+ * Copies data from source dynptr to destination dynptr.
+ * Returns 0 on success; a negative error otherwise.
+ */
+__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off,
+ struct bpf_dynptr *src_ptr, u32 src_off, u32 size)
+{
+ struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr;
+ struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr;
+ void *src_slice, *dst_slice;
+ char buf[256];
+ u32 off;
+
+ src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size);
+ dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size);
+
+ if (src_slice && dst_slice) {
+ memmove(dst_slice, src_slice, size);
+ return 0;
+ }
+
+ if (src_slice)
+ return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0);
+
+ if (dst_slice)
+ return __bpf_dynptr_read(dst_slice, size, src, src_off, 0);
+
+ if (bpf_dynptr_check_off_len(dst, dst_off, size) ||
+ bpf_dynptr_check_off_len(src, src_off, size))
+ return -E2BIG;
+
+ off = 0;
+ while (off < size) {
+ u32 chunk_sz = min_t(u32, sizeof(buf), size - off);
+ int err;
+
+ err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0);
+ if (err)
+ return err;
+ err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0);
+ if (err)
+ return err;
+
+ off += chunk_sz;
+ }
+ return 0;
+}
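A short program-side sketch of how this kfunc might be called (a minimal sketch, assuming a __ksym declaration matching the kernel signature above, two global buffers backed by the program's data sections, and the existing bpf_dynptr_from_mem() helper; the section name is illustrative):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, __u32 dst_off,
		    struct bpf_dynptr *src_ptr, __u32 src_off, __u32 size) __ksym;

char src_buf[64];
char dst_buf[64];

SEC("fentry/do_nanosleep")
int copy_example(void *ctx)
{
	struct bpf_dynptr src, dst;
	int err;

	/* Wrap the two global buffers in local dynptrs. */
	if (bpf_dynptr_from_mem(src_buf, sizeof(src_buf), 0, &src) ||
	    bpf_dynptr_from_mem(dst_buf, sizeof(dst_buf), 0, &dst))
		return 0;

	/* Copy 32 bytes from offset 0 of src into offset 16 of dst. */
	err = bpf_dynptr_copy(&dst, 16, &src, 0, 32);
	if (err)
		bpf_printk("dynptr copy failed: %d", err);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";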
+
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
return obj;
@@ -3067,6 +3135,50 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
return ret + 1;
}
+/**
+ * bpf_copy_from_user_task_str() - Copy a string from a task's address space
+ * @dst: Destination address, in kernel space. This buffer must be
+ * at least @dst__sz bytes long.
+ * @dst__sz: Maximum number of bytes to copy, includes the trailing NUL.
+ * @unsafe_ptr__ign: Source address in the task's address space.
+ * @tsk: The task whose address space will be used
+ * @flags: The only supported flag is BPF_F_PAD_ZEROS
+ *
+ * Copies a NUL-terminated string from a task's address space into the @dst
+ * buffer. If the user string is too long, this still ensures zero termination
+ * in the @dst buffer unless the buffer size is 0.
+ *
+ * If the BPF_F_PAD_ZEROS flag is set, the tail of @dst is zeroed on success
+ * and all @dst__sz bytes of @dst are zeroed on failure.
+ *
+ * Return: The number of copied bytes on success including the NUL terminator.
+ * A negative error code on failure.
+ */
+__bpf_kfunc int bpf_copy_from_user_task_str(void *dst, u32 dst__sz,
+ const void __user *unsafe_ptr__ign,
+ struct task_struct *tsk, u64 flags)
+{
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PAD_ZEROS))
+ return -EINVAL;
+
+ if (unlikely(dst__sz == 0))
+ return 0;
+
+ ret = copy_remote_vm_str(tsk, (unsigned long)unsafe_ptr__ign, dst, dst__sz, 0);
+ if (ret < 0) {
+ if (flags & BPF_F_PAD_ZEROS)
+ memset(dst, 0, dst__sz);
+ return ret;
+ }
+
+ if (flags & BPF_F_PAD_ZEROS)
+ memset(dst + ret, 0, dst__sz - ret);
+
+ return ret + 1;
+}
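As a usage sketch, not part of the patch: a sleepable task iterator could use this kfunc to read a remote task's first argument string. The __ksym declaration, the direct walk of task->mm->arg_start as the user pointer, and the choice of program type are assumptions made purely for illustration.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

int bpf_copy_from_user_task_str(void *dst, u32 dst__sz,
				const void *unsafe_ptr__ign,
				struct task_struct *tsk, u64 flags) __ksym;

SEC("iter.s/task")	/* sleepable, to match the KF_SLEEPABLE registration below */
int dump_argv0(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;
	char buf[64];
	int len;

	if (!task || !task->mm)
		return 0;

	len = bpf_copy_from_user_task_str(buf, sizeof(buf),
					  (void *)task->mm->arg_start,
					  task, BPF_F_PAD_ZEROS);
	if (len > 0)
		bpf_printk("pid %d argv[0]: %s", task->pid, buf);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";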
+
 /* Keep unsigned long in prototype so that kfunc is usable when emitted to
* vmlinux.h in BPF programs directly, but note that while in BPF prog, the
* unsigned long always points to 8-byte region on stack, the kernel may only
@@ -3162,6 +3274,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
BTF_ID_FLAGS(func, bpf_dynptr_size)
BTF_ID_FLAGS(func, bpf_dynptr_clone)
+BTF_ID_FLAGS(func, bpf_dynptr_copy)
#ifdef CONFIG_NET
BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
#endif
@@ -3174,6 +3287,7 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_copy_from_user_task_str, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_get_kmem_cache)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9aaf5124648b..dc3aa91a6ba0 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,14 +150,14 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
-static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode->i_op = &bpf_dir_iops;
inode->i_fop = &simple_dir_operations;
@@ -166,7 +166,7 @@ static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inc_nlink(dir);
bpf_dentry_finalize(dentry, inode, dir);
- return 0;
+ return NULL;
}
struct map_iter {
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e8a772e64324..be66d7e520e0 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -15,6 +15,7 @@
#include <net/ipv6.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
+#include <asm/rqspinlock.h>
#include <linux/bpf_mem_alloc.h>
/* Intermediate node */
@@ -36,7 +37,7 @@ struct lpm_trie {
size_t n_entries;
size_t max_prefixlen;
size_t data_size;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
};
/* This trie implements a longest prefix match algorithm that can be used to
@@ -342,7 +343,9 @@ static long trie_update_elem(struct bpf_map *map,
if (!new_node)
return -ENOMEM;
- raw_spin_lock_irqsave(&trie->lock, irq_flags);
+ ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags);
+ if (ret)
+ goto out_free;
new_node->prefixlen = key->prefixlen;
RCU_INIT_POINTER(new_node->child[0], NULL);
@@ -356,8 +359,7 @@ static long trie_update_elem(struct bpf_map *map,
*/
slot = &trie->root;
- while ((node = rcu_dereference_protected(*slot,
- lockdep_is_held(&trie->lock)))) {
+ while ((node = rcu_dereference(*slot))) {
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
@@ -442,8 +444,8 @@ static long trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(*slot, im_node);
out:
- raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
-
+ raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags);
+out_free:
if (ret)
bpf_mem_cache_free(&trie->ma, new_node);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
@@ -467,7 +469,9 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- raw_spin_lock_irqsave(&trie->lock, irq_flags);
+ ret = raw_res_spin_lock_irqsave(&trie->lock, irq_flags);
+ if (ret)
+ return ret;
/* Walk the tree looking for an exact key/length match and keeping
* track of the path we traverse. We will need to know the node
@@ -478,8 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
trim = &trie->root;
trim2 = trim;
parent = NULL;
- while ((node = rcu_dereference_protected(
- *trim, lockdep_is_held(&trie->lock)))) {
+ while ((node = rcu_dereference(*trim))) {
matchlen = longest_prefix_match(trie, node, key);
if (node->prefixlen != matchlen ||
@@ -543,7 +546,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
free_node = node;
out:
- raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+ raw_res_spin_unlock_irqrestore(&trie->lock, irq_flags);
bpf_mem_cache_free_rcu(&trie->ma, free_parent);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
@@ -592,7 +595,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
offsetof(struct bpf_lpm_trie_key_u8, data);
trie->max_prefixlen = trie->data_size * 8;
- raw_spin_lock_init(&trie->lock);
+ raw_res_spin_lock_init(&trie->lock);
/* Allocate intermediate and leaf nodes from the same allocator */
leaf_size = sizeof(struct lpm_trie_node) + trie->data_size +
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 1a4fec330eaa..42ae8d595c2c 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -25,6 +25,7 @@
#include <linux/rhashtable.h>
#include <linux/rtnetlink.h>
#include <linux/rwsem.h>
+#include <net/netdev_lock.h>
#include <net/xdp.h>
/* Protects offdevs, members of bpf_offload_netdev and offload members
@@ -528,13 +529,14 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&offmap->map, attr);
-
rtnl_lock();
- down_write(&bpf_devs_lock);
offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
err = bpf_dev_offload_check(offmap->netdev);
if (err)
- goto err_unlock;
+ goto err_unlock_rtnl;
+
+ netdev_lock_ops(offmap->netdev);
+ down_write(&bpf_devs_lock);
ondev = bpf_offload_find_netdev(offmap->netdev);
if (!ondev) {
@@ -548,12 +550,15 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
list_add_tail(&offmap->offloads, &ondev->maps);
up_write(&bpf_devs_lock);
+ netdev_unlock_ops(offmap->netdev);
rtnl_unlock();
return &offmap->map;
err_unlock:
up_write(&bpf_devs_lock);
+ netdev_unlock_ops(offmap->netdev);
+err_unlock_rtnl:
rtnl_unlock();
bpf_map_area_free(offmap);
return ERR_PTR(err);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 034cf87b54e9..632762b57299 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -14,11 +14,9 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
for_each_possible_cpu(cpu) {
struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);
- raw_spin_lock_init(&head->lock);
+ raw_res_spin_lock_init(&head->lock);
head->first = NULL;
}
- raw_spin_lock_init(&s->extralist.lock);
- s->extralist.first = NULL;
return 0;
}
@@ -34,58 +32,39 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
WRITE_ONCE(head->first, node);
}
-static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+static inline bool ___pcpu_freelist_push(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
- raw_spin_lock(&head->lock);
- pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
-}
-
-static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
- struct pcpu_freelist_node *node)
-{
- if (!raw_spin_trylock(&s->extralist.lock))
+ if (raw_res_spin_lock(&head->lock))
return false;
-
- pcpu_freelist_push_node(&s->extralist, node);
- raw_spin_unlock(&s->extralist.lock);
+ pcpu_freelist_push_node(head, node);
+ raw_res_spin_unlock(&head->lock);
return true;
}
-static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
- struct pcpu_freelist_node *node)
+void __pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
{
- int cpu, orig_cpu;
+ struct pcpu_freelist_head *head;
+ int cpu;
- orig_cpu = raw_smp_processor_id();
- while (1) {
- for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) {
- struct pcpu_freelist_head *head;
+ if (___pcpu_freelist_push(this_cpu_ptr(s->freelist), node))
+ return;
+ while (true) {
+ for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
+ if (cpu == raw_smp_processor_id())
+ continue;
head = per_cpu_ptr(s->freelist, cpu);
- if (raw_spin_trylock(&head->lock)) {
- pcpu_freelist_push_node(head, node);
- raw_spin_unlock(&head->lock);
- return;
- }
- }
-
- /* cannot lock any per cpu lock, try extralist */
- if (pcpu_freelist_try_push_extra(s, node))
+ if (raw_res_spin_lock(&head->lock))
+ continue;
+ pcpu_freelist_push_node(head, node);
+ raw_res_spin_unlock(&head->lock);
return;
+ }
}
}
-void __pcpu_freelist_push(struct pcpu_freelist *s,
- struct pcpu_freelist_node *node)
-{
- if (in_nmi())
- ___pcpu_freelist_push_nmi(s, node);
- else
- ___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
-}
-
void pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
@@ -120,71 +99,29 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
+ struct pcpu_freelist_node *node = NULL;
struct pcpu_freelist_head *head;
- struct pcpu_freelist_node *node;
int cpu;
for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
head = per_cpu_ptr(s->freelist, cpu);
if (!READ_ONCE(head->first))
continue;
- raw_spin_lock(&head->lock);
+ if (raw_res_spin_lock(&head->lock))
+ continue;
node = head->first;
if (node) {
WRITE_ONCE(head->first, node->next);
- raw_spin_unlock(&head->lock);
+ raw_res_spin_unlock(&head->lock);
return node;
}
- raw_spin_unlock(&head->lock);
+ raw_res_spin_unlock(&head->lock);
}
-
- /* per cpu lists are all empty, try extralist */
- if (!READ_ONCE(s->extralist.first))
- return NULL;
- raw_spin_lock(&s->extralist.lock);
- node = s->extralist.first;
- if (node)
- WRITE_ONCE(s->extralist.first, node->next);
- raw_spin_unlock(&s->extralist.lock);
- return node;
-}
-
-static struct pcpu_freelist_node *
-___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
-{
- struct pcpu_freelist_head *head;
- struct pcpu_freelist_node *node;
- int cpu;
-
- for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
- head = per_cpu_ptr(s->freelist, cpu);
- if (!READ_ONCE(head->first))
- continue;
- if (raw_spin_trylock(&head->lock)) {
- node = head->first;
- if (node) {
- WRITE_ONCE(head->first, node->next);
- raw_spin_unlock(&head->lock);
- return node;
- }
- raw_spin_unlock(&head->lock);
- }
- }
-
- /* cannot pop from per cpu lists, try extralist */
- if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock))
- return NULL;
- node = s->extralist.first;
- if (node)
- WRITE_ONCE(s->extralist.first, node->next);
- raw_spin_unlock(&s->extralist.lock);
return node;
}
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
- if (in_nmi())
- return ___pcpu_freelist_pop_nmi(s);
return ___pcpu_freelist_pop(s);
}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3c76553cfe57..914798b74967 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -5,15 +5,15 @@
#define __PERCPU_FREELIST_H__
#include <linux/spinlock.h>
#include <linux/percpu.h>
+#include <asm/rqspinlock.h>
struct pcpu_freelist_head {
struct pcpu_freelist_node *first;
- raw_spinlock_t lock;
+ rqspinlock_t lock;
};
struct pcpu_freelist {
struct pcpu_freelist_head __percpu *freelist;
- struct pcpu_freelist_head extralist;
};
struct pcpu_freelist_node {
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 0c63bc2cd895..2fdf3c978db1 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -90,3 +90,4 @@ static void __exit fini(void)
late_initcall(load);
module_exit(fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs");
diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c
new file mode 100644
index 000000000000..b896c4a75a5c
--- /dev/null
+++ b/kernel/bpf/rqspinlock.c
@@ -0,0 +1,737 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Resilient Queued Spin Lock
+ *
+ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2013-2014,2018 Red Hat, Inc.
+ * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
+ * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
+ *
+ * Authors: Waiman Long <longman@redhat.com>
+ * Peter Zijlstra <peterz@infradead.org>
+ * Kumar Kartikeya Dwivedi <memxor@gmail.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <linux/prefetch.h>
+#include <asm/byteorder.h>
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#endif
+#include <trace/events/lock.h>
+#include <asm/rqspinlock.h>
+#include <linux/timekeeping.h>
+
+/*
+ * Include queued spinlock definitions and statistics code
+ */
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include "../locking/qspinlock.h"
+#include "../locking/lock_events.h"
+#include "rqspinlock.h"
+#include "../locking/mcs_spinlock.h"
+#endif
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
+ * Synchronization on Shared-Memory Multiprocessors by Mellor-Crummey and
+ * Scott") is available at
+ *
+ * https://bugzilla.kernel.org/show_bug.cgi?id=206115
+ *
+ * This queued spinlock implementation is based on the MCS lock, however to
+ * make it fit the 4 bytes we assume spinlock_t to be, and preserve its
+ * existing API, we must modify it somehow.
+ *
+ * In particular; where the traditional MCS lock consists of a tail pointer
+ * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
+ * unlock the next pending (next->locked), we compress both these: {tail,
+ * next->locked} into a single u32 value.
+ *
+ * A spinlock disables recursion of its own context, and there is a limit to
+ * the contexts that can nest: task, softirq, hardirq, and nmi. As there are
+ * at most 4 nesting levels, the nesting level can be encoded by a 2-bit number. Now
+ * we can encode the tail by combining the 2-bit nesting level with the cpu
+ * number. With one byte for the lock value and 3 bytes for the tail, only a
+ * 32-bit word is now needed. Even though we only need 1 bit for the lock,
+ * we extend it to a full byte to achieve better performance for architectures
+ * that support atomic byte write.
+ *
+ * We also change the first spinner to spin on the lock bit instead of its
+ * node, thereby avoiding the need to carry a node from lock to unlock and
+ * preserving the existing lock API. This also makes the unlock code simpler and
+ * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ * atomic operations on smaller 8-bit and 16-bit data types.
+ *
+ */
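To make the tail encoding above concrete, here is an editor's sketch of how a (cpu, nesting index) pair maps into the upper bits of the lock word. The bit offsets are assumptions taken from the NR_CPUS < 16K layout in qspinlock.h (8 locked bits, 8 pending bits, then 2 index bits, then the CPU number plus one); the real code uses encode_tail() from kernel/locking/qspinlock.h.

#define SKETCH_TAIL_IDX_OFFSET	16	/* past the locked and pending bytes */
#define SKETCH_TAIL_CPU_OFFSET	18	/* past the 2 tail-index bits */

static inline unsigned int sketch_encode_tail(int cpu, int idx)
{
	/* The CPU is stored +1 so that a tail of 0 means "no waiters queued". */
	return ((unsigned int)(cpu + 1) << SKETCH_TAIL_CPU_OFFSET) |
	       ((unsigned int)idx << SKETCH_TAIL_IDX_OFFSET);
}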
+
+struct rqspinlock_timeout {
+ u64 timeout_end;
+ u64 duration;
+ u64 cur;
+ u16 spin;
+};
+
+#define RES_TIMEOUT_VAL 2
+
+DEFINE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
+EXPORT_SYMBOL_GPL(rqspinlock_held_locks);
+
+static bool is_lock_released(rqspinlock_t *lock, u32 mask, struct rqspinlock_timeout *ts)
+{
+ return !(atomic_read_acquire(&lock->val) & mask);
+}
+
+static noinline int check_deadlock_AA(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ int cnt = min(RES_NR_HELD, rqh->cnt);
+
+ /*
+ * Return an error if we hold the lock we are attempting to acquire.
+ * We'll iterate over max 32 locks; no need to do is_lock_released.
+ */
+ for (int i = 0; i < cnt - 1; i++) {
+ if (rqh->locks[i] == lock)
+ return -EDEADLK;
+ }
+ return 0;
+}
+
+/*
+ * This focuses on the most common case of ABBA deadlocks (or ABBA involving
+ * more locks, which reduce to ABBA). This is not exhaustive, and we rely on
+ * timeouts as the final line of defense.
+ */
+static noinline int check_deadlock_ABBA(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
+ int rqh_cnt = min(RES_NR_HELD, rqh->cnt);
+ void *remote_lock;
+ int cpu;
+
+ /*
+ * Find the CPU holding the lock that we want to acquire. If there is a
+ * deadlock scenario, we will read a stable set on the remote CPU and
+ * find the target. This would be a constant time operation instead of
+ * O(NR_CPUS) if we could determine the owning CPU from a lock value, but
+ * that requires increasing the size of the lock word.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rqspinlock_held *rqh_cpu = per_cpu_ptr(&rqspinlock_held_locks, cpu);
+ int real_cnt = READ_ONCE(rqh_cpu->cnt);
+ int cnt = min(RES_NR_HELD, real_cnt);
+
+ /*
+ * Make sure we break out of this loop if the lock becomes available
+ * for us to acquire.
+ */
+ if (is_lock_released(lock, mask, ts))
+ return 0;
+
+ /*
+ * Skip ourselves, and CPUs whose count is less than 2, as they need at
+ * least one held lock and one acquisition attempt (reflected as the
+ * topmost entry) to participate in an ABBA deadlock.
+ *
+ * If cnt is more than RES_NR_HELD, it means the current lock being
+ * acquired won't appear in the table, and other locks in the table are
+ * already held, so we can't determine ABBA.
+ */
+ if (cpu == smp_processor_id() || real_cnt < 2 || real_cnt > RES_NR_HELD)
+ continue;
+
+ /*
+ * Obtain the entry at the top, this corresponds to the lock the
+ * remote CPU is attempting to acquire in a deadlock situation,
+ * and would be one of the locks we hold on the current CPU.
+ */
+ remote_lock = READ_ONCE(rqh_cpu->locks[cnt - 1]);
+ /*
+ * If it is NULL, we've raced and cannot determine a deadlock
+ * conclusively, skip this CPU.
+ */
+ if (!remote_lock)
+ continue;
+ /*
+ * Find if the lock we're attempting to acquire is held by this CPU.
+ * Don't consider the topmost entry, as that must be the latest lock
+ * being held or acquired. For a deadlock, the target CPU must also
+ * attempt to acquire a lock we hold, so for this search only 'cnt - 1'
+ * entries are important.
+ */
+ for (int i = 0; i < cnt - 1; i++) {
+ if (READ_ONCE(rqh_cpu->locks[i]) != lock)
+ continue;
+ /*
+ * We found our lock as held on the remote CPU. Is the
+ * acquisition attempt on the remote CPU for a lock held
+ * by us? If so, we have a deadlock situation, and need
+ * to recover.
+ */
+ for (int i = 0; i < rqh_cnt - 1; i++) {
+ if (rqh->locks[i] == remote_lock)
+ return -EDEADLK;
+ }
+ /*
+ * Inconclusive; retry again later.
+ */
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static noinline int check_deadlock(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ int ret;
+
+ ret = check_deadlock_AA(lock, mask, ts);
+ if (ret)
+ return ret;
+ ret = check_deadlock_ABBA(lock, mask, ts);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static noinline int check_timeout(rqspinlock_t *lock, u32 mask,
+ struct rqspinlock_timeout *ts)
+{
+ u64 time = ktime_get_mono_fast_ns();
+ u64 prev = ts->cur;
+
+ if (!ts->timeout_end) {
+ ts->cur = time;
+ ts->timeout_end = time + ts->duration;
+ return 0;
+ }
+
+ if (time > ts->timeout_end)
+ return -ETIMEDOUT;
+
+ /*
+ * A millisecond interval passed from last time? Trigger deadlock
+ * checks.
+ */
+ if (prev + NSEC_PER_MSEC < time) {
+ ts->cur = time;
+ return check_deadlock(lock, mask, ts);
+ }
+
+ return 0;
+}
+
+/*
+ * Do not amortize with spins when res_smp_cond_load_acquire is defined,
+ * as the macro does internal amortization for us.
+ */
+#ifndef res_smp_cond_load_acquire
+#define RES_CHECK_TIMEOUT(ts, ret, mask) \
+ ({ \
+ if (!(ts).spin++) \
+ (ret) = check_timeout((lock), (mask), &(ts)); \
+ (ret); \
+ })
+#else
+#define RES_CHECK_TIMEOUT(ts, ret, mask) \
+ ({ (ret) = check_timeout((lock), (mask), &(ts)); })
+#endif
+
+/*
+ * Initialize the 'spin' member.
+ * Set spin member to 0 to trigger AA/ABBA checks immediately.
+ */
+#define RES_INIT_TIMEOUT(ts) ({ (ts).spin = 0; })
+
+/*
+ * We only need to reset 'timeout_end', 'spin' will just wrap around as necessary.
+ * Duration is defined for each spin attempt, so set it here.
+ */
+#define RES_RESET_TIMEOUT(ts, _duration) ({ (ts).timeout_end = 0; (ts).duration = _duration; })
+
+/*
+ * Provide a test-and-set fallback for cases when queued spin lock support is
+ * absent from the architecture.
+ */
+int __lockfunc resilient_tas_spin_lock(rqspinlock_t *lock)
+{
+ struct rqspinlock_timeout ts;
+ int val, ret = 0;
+
+ RES_INIT_TIMEOUT(ts);
+ grab_held_lock_entry(lock);
+
+ /*
+ * Unlike the rqspinlock waiting loops, the time spent in this waiting
+ * loop depends directly on the amount of contention, so a short timeout
+ * isn't enough. Choose one second as the timeout value.
+ */
+ RES_RESET_TIMEOUT(ts, NSEC_PER_SEC);
+retry:
+ val = atomic_read(&lock->val);
+
+ if (val || !atomic_try_cmpxchg(&lock->val, &val, 1)) {
+ if (RES_CHECK_TIMEOUT(ts, ret, ~0u))
+ goto out;
+ cpu_relax();
+ goto retry;
+ }
+
+ return 0;
+out:
+ release_held_lock_entry();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_tas_spin_lock);
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+/*
+ * Per-CPU queue node structures; we can never have more than 4 nested
+ * contexts: task, softirq, hardirq, nmi.
+ *
+ * Exactly fits one 64-byte cacheline on a 64-bit architecture.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
+
+#ifndef res_smp_cond_load_acquire
+#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
+#endif
+
+#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+
+/**
+ * resilient_queued_spin_lock_slowpath - acquire the queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ * @val: Current value of the queued spinlock 32-bit word
+ *
+ * Return:
+ * * 0 - Lock was acquired successfully.
+ * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
+ * * -ETIMEDOUT - Lock acquisition failed because of timeout.
+ *
+ * (queue tail, pending bit, lock value)
+ *
+ * fast : slow : unlock
+ * : :
+ * uncontended (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
+ * : | ^--------.------. / :
+ * : v \ \ | :
+ * pending : (0,1,1) +--> (0,1,0) \ | :
+ * : | ^--' | | :
+ * : v | | :
+ * uncontended : (n,x,y) +--> (n,0,0) --' | :
+ * queue : | ^--' | :
+ * : v | :
+ * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' :
+ * queue : ^--' :
+ */
+int __lockfunc resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val)
+{
+ struct mcs_spinlock *prev, *next, *node;
+ struct rqspinlock_timeout ts;
+ int idx, ret = 0;
+ u32 old, tail;
+
+ BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
+ if (resilient_virt_spin_lock_enabled())
+ return resilient_virt_spin_lock(lock);
+
+ RES_INIT_TIMEOUT(ts);
+
+ /*
+ * Wait for in-progress pending->locked hand-overs with a bounded
+ * number of spins so that we guarantee forward progress.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ if (val == _Q_PENDING_VAL) {
+ int cnt = _Q_PENDING_LOOPS;
+ val = atomic_cond_read_relaxed(&lock->val,
+ (VAL != _Q_PENDING_VAL) || !cnt--);
+ }
+
+ /*
+ * If we observe any contention; queue.
+ */
+ if (val & ~_Q_LOCKED_MASK)
+ goto queue;
+
+ /*
+ * trylock || pending
+ *
+ * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
+ */
+ val = queued_fetch_set_pending_acquire(lock);
+
+ /*
+ * If we observe contention, there is a concurrent locker.
+ *
+ * Undo and queue; our setting of PENDING might have made the
+ * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+ * on @next to become !NULL.
+ */
+ if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+ /* Undo PENDING if we set it. */
+ if (!(val & _Q_PENDING_MASK))
+ clear_pending(lock);
+
+ goto queue;
+ }
+
+ /*
+ * Grab an entry in the held locks array, to enable deadlock detection.
+ */
+ grab_held_lock_entry(lock);
+
+ /*
+ * We're pending, wait for the owner to go away.
+ *
+ * 0,1,1 -> *,1,0
+ *
+ * this wait loop must be a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because not all
+ * clear_pending_set_locked() implementations imply full
+ * barriers.
+ */
+ if (val & _Q_LOCKED_MASK) {
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
+ res_smp_cond_load_acquire(&lock->locked, !VAL || RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_MASK));
+ }
+
+ if (ret) {
+ /*
+ * We waited for the locked bit to go back to 0, as the pending
+ * waiter, but timed out. We need to clear the pending bit since
+ * we own it. Once a stuck owner has been recovered, the lock
+ * must be restored to a valid state, hence removing the pending
+ * bit is necessary.
+ *
+ * *,1,* -> *,0,*
+ */
+ clear_pending(lock);
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_entry;
+ }
+
+ /*
+ * take ownership and clear the pending bit.
+ *
+ * 0,1,0 -> 0,0,1
+ */
+ clear_pending_set_locked(lock);
+ lockevent_inc(lock_pending);
+ return 0;
+
+ /*
+ * End of pending bit optimistic spinning and beginning of MCS
+ * queuing.
+ */
+queue:
+ lockevent_inc(lock_slowpath);
+ /*
+ * Grab deadlock detection entry for the queue path.
+ */
+ grab_held_lock_entry(lock);
+
+ node = this_cpu_ptr(&rqnodes[0].mcs);
+ idx = node->count++;
+ tail = encode_tail(smp_processor_id(), idx);
+
+ trace_contention_begin(lock, LCB_F_SPIN);
+
+ /*
+ * 4 nodes are allocated based on the assumption that there will
+ * not be nested NMIs taking spinlocks. That may not be true in
+ * some architectures even though the chance of needing more than
+ * 4 nodes will still be extremely unlikely. When that happens,
+ * we fall back to spinning on the lock directly without using
+ * any MCS node. This is not the most elegant solution, but is
+ * simple enough.
+ */
+ if (unlikely(idx >= _Q_MAX_NODES)) {
+ lockevent_inc(lock_no_node);
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT);
+ while (!queued_spin_trylock(lock)) {
+ if (RES_CHECK_TIMEOUT(ts, ret, ~0u)) {
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_node;
+ }
+ cpu_relax();
+ }
+ goto release;
+ }
+
+ node = grab_mcs_node(node, idx);
+
+ /*
+ * Keep counts of non-zero index values:
+ */
+ lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
+
+ /*
+ * Ensure that we increment the head node->count before initialising
+ * the actual node. If the compiler is kind enough to reorder these
+ * stores, then an IRQ could overwrite our assignments.
+ */
+ barrier();
+
+ node->locked = 0;
+ node->next = NULL;
+
+ /*
+ * We touched a (possibly) cold cacheline in the per-cpu queue node;
+ * attempt the trylock once more in the hope someone let go while we
+ * weren't watching.
+ */
+ if (queued_spin_trylock(lock))
+ goto release;
+
+ /*
+ * Ensure that the initialisation of @node is complete before we
+ * publish the updated tail via xchg_tail() and potentially link
+ * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+ */
+ smp_wmb();
+
+ /*
+ * Publish the updated tail.
+ * We have already touched the queueing cacheline; don't bother with
+ * pending stuff.
+ *
+ * p,*,* -> n,*,*
+ */
+ old = xchg_tail(lock, tail);
+ next = NULL;
+
+ /*
+ * if there was a previous node; link it and wait until reaching the
+ * head of the waitqueue.
+ */
+ if (old & _Q_TAIL_MASK) {
+ int val;
+
+ prev = decode_tail(old, rqnodes);
+
+ /* Link @node into the waitqueue. */
+ WRITE_ONCE(prev->next, node);
+
+ val = arch_mcs_spin_lock_contended(&node->locked);
+ if (val == RES_TIMEOUT_VAL) {
+ ret = -EDEADLK;
+ goto waitq_timeout;
+ }
+
+ /*
+ * While waiting for the MCS lock, the next pointer may have
+ * been set by another lock waiter. We optimistically load
+ * the next pointer & prefetch the cacheline for writing
+ * to reduce latency in the upcoming MCS unlock operation.
+ */
+ next = READ_ONCE(node->next);
+ if (next)
+ prefetchw(next);
+ }
+
+ /*
+ * we're at the head of the waitqueue, wait for the owner & pending to
+ * go away.
+ *
+ * *,x,y -> *,0,0
+ *
+ * this wait loop must use a load-acquire such that we match the
+ * store-release that clears the locked bit and create lock
+ * sequentiality; this is because the set_locked() function below
+ * does not imply a full barrier.
+ *
+ * We use RES_DEF_TIMEOUT * 2 as the duration, as RES_DEF_TIMEOUT is
+ * meant to span maximum allowed time per critical section, and we may
+ * have both the owner of the lock and the pending bit waiter ahead of
+ * us.
+ */
+ RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT * 2);
+ val = res_atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK) ||
+ RES_CHECK_TIMEOUT(ts, ret, _Q_LOCKED_PENDING_MASK));
+
+waitq_timeout:
+ if (ret) {
+ /*
+ * If the tail is still pointing to us, then we are the final waiter,
+ * and are responsible for resetting the tail back to 0. Otherwise, if
+ * the cmpxchg operation fails, we signal the next waiter to take exit
+ * and try the same. For a waiter with tail node 'n':
+ *
+ * n,*,* -> 0,*,*
+ *
+ * When performing cmpxchg for the whole word (NR_CPUS > 16k), it is
+ * possible locked/pending bits keep changing and we see failures even
+ * when we remain the head of wait queue. However, eventually,
+ * pending bit owner will unset the pending bit, and new waiters
+ * will queue behind us. This will leave the lock owner in
+ * charge, and it will eventually either set locked bit to 0, or
+ * leave it as 1, allowing us to make progress.
+ *
+ * We terminate the whole wait queue for two reasons. Firstly,
+ * we eschew per-waiter timeouts with one applied at the head of
+ * the wait queue. This allows everyone to break out faster
+ * once we've seen the owner / pending waiter not responding for
+ * the timeout duration from the head. Secondly, it avoids
+ * complicated synchronization, because when not leaving in FIFO
+ * order, prev's next pointer needs to be fixed up etc.
+ */
+ if (!try_cmpxchg_tail(lock, tail, 0)) {
+ next = smp_cond_load_relaxed(&node->next, VAL);
+ WRITE_ONCE(next->locked, RES_TIMEOUT_VAL);
+ }
+ lockevent_inc(rqspinlock_lock_timeout);
+ goto err_release_node;
+ }
+
+ /*
+ * claim the lock:
+ *
+ * n,0,0 -> 0,0,1 : lock, uncontended
+ * *,*,0 -> *,*,1 : lock, contended
+ *
+ * If the queue head is the only one in the queue (lock value == tail)
+ * and nobody is pending, clear the tail code and grab the lock.
+ * Otherwise, we only need to grab the lock.
+ */
+
+ /*
+ * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+ * above wait condition, therefore any concurrent setting of
+ * PENDING will make the uncontended transition fail.
+ */
+ if ((val & _Q_TAIL_MASK) == tail) {
+ if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ goto release; /* No contention */
+ }
+
+ /*
+ * Either somebody is queued behind us or _Q_PENDING_VAL got set
+ * which will then detect the remaining tail and queue behind us
+ * ensuring we'll see a @next.
+ */
+ set_locked(lock);
+
+ /*
+ * contended path; wait for next if not observed yet, release.
+ */
+ if (!next)
+ next = smp_cond_load_relaxed(&node->next, (VAL));
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+
+release:
+ trace_contention_end(lock, 0);
+
+ /*
+ * release the node
+ */
+ __this_cpu_dec(rqnodes[0].mcs.count);
+ return ret;
+err_release_node:
+ trace_contention_end(lock, ret);
+ __this_cpu_dec(rqnodes[0].mcs.count);
+err_release_entry:
+ release_held_lock_entry();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath);
+
+#endif /* CONFIG_QUEUED_SPINLOCKS */
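For reference, the in-kernel calling convention that the hashtab, lpm_trie and percpu_freelist conversions earlier in this patch rely on looks roughly like the sketch below. The raw_res_spin_lock_* wrappers are assumed to come from <asm/rqspinlock.h>, as included by those files; the struct and function names here are illustrative only.

#include <asm/rqspinlock.h>

struct my_bucket {
	rqspinlock_t lock;	/* initialised once with raw_res_spin_lock_init() */
	/* ... protected data ... */
};

static int my_update(struct my_bucket *b)
{
	unsigned long flags;
	int ret;

	ret = raw_res_spin_lock_irqsave(&b->lock, flags);
	if (ret)	/* e.g. -EDEADLK or -ETIMEDOUT: bail out instead of hanging */
		return ret;

	/* ... critical section ... */

	raw_res_spin_unlock_irqrestore(&b->lock, flags);
	return 0;
}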
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock)
+{
+ int ret;
+
+ BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock));
+ BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock));
+
+ preempt_disable();
+ ret = res_spin_lock((rqspinlock_t *)lock);
+ if (unlikely(ret)) {
+ preempt_enable();
+ return ret;
+ }
+ return 0;
+}
+
+__bpf_kfunc void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock)
+{
+ res_spin_unlock((rqspinlock_t *)lock);
+ preempt_enable();
+}
+
+__bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+ u64 *ptr = (u64 *)flags__irq_flag;
+ unsigned long flags;
+ int ret;
+
+ preempt_disable();
+ local_irq_save(flags);
+ ret = res_spin_lock((rqspinlock_t *)lock);
+ if (unlikely(ret)) {
+ local_irq_restore(flags);
+ preempt_enable();
+ return ret;
+ }
+ *ptr = flags;
+ return 0;
+}
+
+__bpf_kfunc void bpf_res_spin_unlock_irqrestore(struct bpf_res_spin_lock *lock, unsigned long *flags__irq_flag)
+{
+ u64 *ptr = (u64 *)flags__irq_flag;
+ unsigned long flags = *ptr;
+
+ res_spin_unlock((rqspinlock_t *)lock);
+ local_irq_restore(flags);
+ preempt_enable();
+}
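A program-side usage sketch may help here; it is an illustration only, assuming struct bpf_res_spin_lock is visible through vmlinux.h, that the map value carries the lock as a BTF-described field (see the BPF_RES_SPIN_LOCK handling added to syscall.c below), and the usual __ksym kfunc declarations.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym;
void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym;

struct val_t {
	struct bpf_res_spin_lock lock;
	long counter;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct val_t);
} counters SEC(".maps");

SEC("tc")
int res_lock_example(struct __sk_buff *skb)
{
	struct val_t *v;
	int key = 0;

	v = bpf_map_lookup_elem(&counters, &key);
	if (!v)
		return 0;

	/* Unlike bpf_spin_lock(), acquisition can fail (-EDEADLK, -ETIMEDOUT). */
	if (bpf_res_spin_lock(&v->lock))
		return 0;
	v->counter++;
	bpf_res_spin_unlock(&v->lock);

	return 0;
}

char LICENSE[] SEC("license") = "GPL";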
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(rqspinlock_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_res_spin_lock, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_res_spin_unlock)
+BTF_ID_FLAGS(func, bpf_res_spin_lock_irqsave, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_res_spin_unlock_irqrestore)
+BTF_KFUNCS_END(rqspinlock_kfunc_ids)
+
+static const struct btf_kfunc_id_set rqspinlock_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &rqspinlock_kfunc_ids,
+};
+
+static __init int rqspinlock_register_kfuncs(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &rqspinlock_kfunc_set);
+}
+late_initcall(rqspinlock_register_kfuncs);
diff --git a/kernel/bpf/rqspinlock.h b/kernel/bpf/rqspinlock.h
new file mode 100644
index 000000000000..5d8cb1b1aab4
--- /dev/null
+++ b/kernel/bpf/rqspinlock.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Resilient Queued Spin Lock defines
+ *
+ * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
+ *
+ * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
+ */
+#ifndef __LINUX_RQSPINLOCK_H
+#define __LINUX_RQSPINLOCK_H
+
+#include "../locking/qspinlock.h"
+
+/*
+ * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value
+ * @lock: Pointer to queued spinlock structure
+ * @tail: The tail to compare against
+ * @new_tail: The new queue tail code word
+ * Return: Bool to indicate whether the cmpxchg operation succeeded
+ *
+ * This is used by the head of the wait queue to clean up the queue.
+ * Provides relaxed ordering, since observers only rely on initialized
+ * state of the node which was made visible through the xchg_tail operation,
+ * i.e. through the smp_wmb preceding xchg_tail.
+ *
+ * We avoid using 16-bit cmpxchg, which is not available on all architectures.
+ */
+static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail)
+{
+ u32 old, new;
+
+ old = atomic_read(&lock->val);
+ do {
+ /*
+ * Is the tail part we compare to already stale? Fail.
+ */
+ if ((old & _Q_TAIL_MASK) != tail)
+ return false;
+ /*
+ * Encode latest locked/pending state for new tail.
+ */
+ new = (old & _Q_LOCKED_PENDING_MASK) | new_tail;
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
+
+ return true;
+}
+
+#endif /* __LINUX_RQSPINLOCK_H */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e1e42e918ba7..9794446bc8c6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+static bool can_alloc_pages(void)
+{
+ return preempt_count() == 0 && !irqs_disabled() &&
+ !IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+static struct page *__bpf_alloc_page(int nid)
+{
+ if (!can_alloc_pages())
+ return try_alloc_pages(nid, 0);
+
+ return alloc_pages_node(nid,
+ GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
+ | __GFP_NOWARN,
+ 0);
+}
+
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long nr_pages, struct page **pages)
{
unsigned long i, j;
@@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
old_memcg = set_active_memcg(memcg);
#endif
for (i = 0; i < nr_pages; i++) {
- pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+ pg = __bpf_alloc_page(nid);
if (pg) {
pages[i] = pg;
continue;
}
for (j = 0; j < i; j++)
- __free_page(pages[j]);
+ free_pages_nolock(pages[j], 0);
ret = -ENOMEM;
break;
}
@@ -648,6 +665,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_RB_ROOT:
case BPF_RB_NODE:
case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
@@ -700,6 +718,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_RB_ROOT:
case BPF_RB_NODE:
case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
case BPF_TIMER:
case BPF_REFCOUNT:
case BPF_WORKQUEUE:
@@ -777,6 +796,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
switch (fields[i].type) {
case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
break;
case BPF_TIMER:
bpf_timer_cancel_and_free(field_ptr);
@@ -1212,7 +1232,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
return -EINVAL;
map->record = btf_parse_fields(btf, value_type,
- BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
+ BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
@@ -1231,6 +1251,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
case 0:
continue;
case BPF_SPIN_LOCK:
+ case BPF_RES_SPIN_LOCK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
@@ -1315,7 +1336,7 @@ static bool bpf_net_capable(void)
#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
/* called via syscall */
-static int map_create(union bpf_attr *attr)
+static int map_create(union bpf_attr *attr, bool kernel)
{
const struct bpf_map_ops *ops;
struct bpf_token *token = NULL;
@@ -1505,7 +1526,7 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = security_bpf_map_create(map, attr, token);
+ err = security_bpf_map_create(map, attr, token, kernel);
if (err)
goto free_map_sec;
@@ -1593,11 +1614,8 @@ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map)
{
- spin_lock_bh(&map_idr_lock);
- map = __bpf_map_inc_not_zero(map, false);
- spin_unlock_bh(&map_idr_lock);
-
- return map;
+ lockdep_assert(rcu_read_lock_held());
+ return __bpf_map_inc_not_zero(map, false);
}
EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
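With the lockdep assertion above, callers now provide their own RCU protection instead of relying on map_idr_lock. A minimal sketch of the resulting pattern, with a hypothetical caller and illustrative names:

static struct bpf_map *grab_map_ref(struct bpf_map *map)
{
	struct bpf_map *ret;

	rcu_read_lock();			/* caller supplies the RCU read section */
	ret = bpf_map_inc_not_zero(map);
	rcu_read_unlock();

	return ret;	/* ERR_PTR(-ENOENT) if the map's refcount already hit zero */
}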
@@ -2314,6 +2332,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
kvfree(prog->aux->jited_linfo);
kvfree(prog->aux->linfo);
kfree(prog->aux->kfunc_tab);
+ kfree(prog->aux->ctx_arg_info);
if (prog->aux->attach_btf)
btf_put(prog->aux->attach_btf);
@@ -2944,7 +2963,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (err < 0)
goto free_prog;
- err = security_bpf_prog_load(prog, attr, token);
+ err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel);
if (err)
goto free_prog_sec;
@@ -4169,7 +4188,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
#define BPF_F_ATTACH_MASK_BASE \
(BPF_F_ALLOW_OVERRIDE | \
BPF_F_ALLOW_MULTI | \
- BPF_F_REPLACE)
+ BPF_F_REPLACE | \
+ BPF_F_PREORDER)
#define BPF_F_ATTACH_MASK_MPROG \
(BPF_F_REPLACE | \
@@ -4733,6 +4753,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.recursion_misses = stats.misses;
info.verified_insns = prog->aux->verified_insns;
+ if (prog->aux->btf)
+ info.btf_id = btf_obj_id(prog->aux->btf);
if (!bpf_capable()) {
info.jited_prog_len = 0;
@@ -4879,8 +4901,6 @@ static int bpf_prog_get_info_by_fd(struct file *file,
}
}
- if (prog->aux->btf)
- info.btf_id = btf_obj_id(prog->aux->btf);
info.attach_btf_id = prog->aux->attach_btf_id;
if (attach_btf)
info.attach_btf_obj_id = btf_obj_id(attach_btf);
@@ -5121,15 +5141,34 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_
return btf_new_fd(attr, uattr, uattr_size);
}
-#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
+#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD fd_by_id_token_fd
static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
{
+ struct bpf_token *token = NULL;
+
if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
+ if (attr->open_flags & ~BPF_F_TOKEN_FD)
+ return -EINVAL;
+
+ if (attr->open_flags & BPF_F_TOKEN_FD) {
+ token = bpf_token_get_from_fd(attr->fd_by_id_token_fd);
+ if (IS_ERR(token))
+ return PTR_ERR(token);
+ if (!bpf_token_allow_cmd(token, BPF_BTF_GET_FD_BY_ID)) {
+ bpf_token_put(token);
+ token = NULL;
+ }
+ }
+
+ if (!bpf_token_capable(token, CAP_SYS_ADMIN)) {
+ bpf_token_put(token);
return -EPERM;
+ }
+
+ bpf_token_put(token);
return btf_get_fd_by_id(attr->btf_id);
}
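From user space, the new token path could be exercised roughly as follows. This is a sketch, not part of the patch, and it assumes the matching UAPI additions from this series (the fd_by_id_token_fd field and the BPF_F_TOKEN_FD flag in <linux/bpf.h>).

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int btf_get_fd_by_id_with_token(__u32 btf_id, int token_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.btf_id = btf_id;
	attr.open_flags = BPF_F_TOKEN_FD;
	attr.fd_by_id_token_fd = token_fd;

	return syscall(__NR_bpf, BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr));
}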
@@ -5768,13 +5807,13 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
if (copy_from_bpfptr(&attr, uattr, size) != 0)
return -EFAULT;
- err = security_bpf(cmd, &attr, size);
+ err = security_bpf(cmd, &attr, size, uattr.is_kernel);
if (err < 0)
return err;
switch (cmd) {
case BPF_MAP_CREATE:
- err = map_create(&attr);
+ err = map_create(&attr, uattr.is_kernel);
break;
case BPF_MAP_LOOKUP_ELEM:
err = map_lookup_elem(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 60611df77957..54c6953a8b84 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -456,7 +456,7 @@ static bool subprog_is_exc_cb(struct bpf_verifier_env *env, int subprog)
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
- return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
+ return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK);
}
static bool type_is_rdonly_mem(u32 type)
@@ -579,6 +579,13 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
insn->imm == BPF_CMPXCHG;
}
+static bool is_atomic_load_insn(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_STX &&
+ BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_LOAD_ACQ;
+}
+
static int __get_spi(s32 off)
{
return (-off - 1) / BPF_REG_SIZE;
@@ -1148,7 +1155,8 @@ static int release_irq_state(struct bpf_verifier_state *state, int id);
static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
struct bpf_kfunc_call_arg_meta *meta,
- struct bpf_reg_state *reg, int insn_idx)
+ struct bpf_reg_state *reg, int insn_idx,
+ int kfunc_class)
{
struct bpf_func_state *state = func(env, reg);
struct bpf_stack_state *slot;
@@ -1170,6 +1178,7 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
st->live |= REG_LIVE_WRITTEN;
st->ref_obj_id = id;
+ st->irq.kfunc_class = kfunc_class;
for (i = 0; i < BPF_REG_SIZE; i++)
slot->slot_type[i] = STACK_IRQ_FLAG;
@@ -1178,7 +1187,8 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
return 0;
}
-static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int kfunc_class)
{
struct bpf_func_state *state = func(env, reg);
struct bpf_stack_state *slot;
@@ -1192,6 +1202,15 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
slot = &state->stack[spi];
st = &slot->spilled_ptr;
+ if (st->irq.kfunc_class != kfunc_class) {
+ const char *flag_kfunc = st->irq.kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock";
+ const char *used_kfunc = kfunc_class == IRQ_NATIVE_KFUNC ? "native" : "lock";
+
+ verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n",
+ flag_kfunc, used_kfunc);
+ return -EINVAL;
+ }
+
err = release_irq_state(env->cur_state, st->ref_obj_id);
WARN_ON_ONCE(err && err != -EACCES);
if (err) {
@@ -1409,6 +1428,8 @@ static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf
dst->active_preempt_locks = src->active_preempt_locks;
dst->active_rcu_lock = src->active_rcu_lock;
dst->active_irq_id = src->active_irq_id;
+ dst->active_lock_id = src->active_lock_id;
+ dst->active_lock_ptr = src->active_lock_ptr;
return 0;
}
@@ -1508,6 +1529,8 @@ static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum r
s->ptr = ptr;
state->active_locks++;
+ state->active_lock_id = id;
+ state->active_lock_ptr = ptr;
return 0;
}
@@ -1545,18 +1568,37 @@ static void release_reference_state(struct bpf_verifier_state *state, int idx)
return;
}
+static bool find_reference_state(struct bpf_verifier_state *state, int ptr_id)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++)
+ if (state->refs[i].id == ptr_id)
+ return true;
+
+ return false;
+}
+
static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr)
{
+ void *prev_ptr = NULL;
+ u32 prev_id = 0;
int i;
for (i = 0; i < state->acquired_refs; i++) {
- if (state->refs[i].type != type)
- continue;
- if (state->refs[i].id == id && state->refs[i].ptr == ptr) {
+ if (state->refs[i].type == type && state->refs[i].id == id &&
+ state->refs[i].ptr == ptr) {
release_reference_state(state, i);
state->active_locks--;
+ /* Reassign active lock (id, ptr). */
+ state->active_lock_id = prev_id;
+ state->active_lock_ptr = prev_ptr;
return 0;
}
+ if (state->refs[i].type & REF_TYPE_LOCK_MASK) {
+ prev_id = state->refs[i].id;
+ prev_ptr = state->refs[i].ptr;
+ }
}
return -EINVAL;
}
@@ -1591,7 +1633,7 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st
for (i = 0; i < state->acquired_refs; i++) {
struct bpf_reference_state *s = &state->refs[i];
- if (s->type != type)
+ if (!(s->type & type))
continue;
if (s->id == id && s->ptr == ptr)
@@ -1600,6 +1642,14 @@ static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *st
return NULL;
}
+static void update_peak_states(struct bpf_verifier_env *env)
+{
+ u32 cur_states;
+
+ cur_states = env->explored_states_size + env->free_list_size;
+ env->peak_states = max(env->peak_states, cur_states);
+}
+
static void free_func_state(struct bpf_func_state *state)
{
if (!state)
@@ -1622,6 +1672,50 @@ static void free_verifier_state(struct bpf_verifier_state *state,
kfree(state);
}
+/* struct bpf_verifier_state->{parent,loop_entry} refer to states
+ * that are in either of env->{explored_states,free_list}.
+ * In both cases the state is contained in struct bpf_verifier_state_list.
+ */
+static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_state *st)
+{
+ if (st->parent)
+ return container_of(st->parent, struct bpf_verifier_state_list, state);
+ return NULL;
+}
+
+static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st)
+{
+ if (st->loop_entry)
+ return container_of(st->loop_entry, struct bpf_verifier_state_list, state);
+ return NULL;
+}
+
+/* A state can be freed if it is no longer referenced:
+ * - is in the env->free_list;
+ * - has no child states;
+ * - is not used as loop_entry.
+ *
+ * Freeing a state can make its loop_entry freeable.
+ */
+static void maybe_free_verifier_state(struct bpf_verifier_env *env,
+ struct bpf_verifier_state_list *sl)
+{
+ struct bpf_verifier_state_list *loop_entry_sl;
+
+ while (sl && sl->in_free_list &&
+ sl->state.branches == 0 &&
+ sl->state.used_as_loop_entry == 0) {
+ loop_entry_sl = state_loop_entry_as_list(&sl->state);
+ if (loop_entry_sl)
+ loop_entry_sl->state.used_as_loop_entry--;
+ list_del(&sl->node);
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ env->free_list_size--;
+ sl = loop_entry_sl;
+ }
+}
+
/* copy verifier state from src to dst growing dst stack space
* when necessary to accommodate larger src stack
*/
@@ -1661,6 +1755,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->callback_unroll_depth = src->callback_unroll_depth;
dst_state->used_as_loop_entry = src->used_as_loop_entry;
dst_state->may_goto_depth = src->may_goto_depth;
+ dst_state->loop_entry = src->loop_entry;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -1681,7 +1776,7 @@ static u32 state_htab_size(struct bpf_verifier_env *env)
return env->prog->len;
}
-static struct bpf_verifier_state_list **explored_state(struct bpf_verifier_env *env, int idx)
+static struct list_head *explored_state(struct bpf_verifier_env *env, int idx)
{
struct bpf_verifier_state *cur = env->cur_state;
struct bpf_func_state *state = cur->frame[cur->curframe];
@@ -1789,16 +1884,13 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
* # Find outermost loop entry known for n
* def get_loop_entry(n):
* h = entries.get(n, None)
- * while h in entries and entries[h] != h:
+ * while h in entries:
* h = entries[h]
* return h
*
- * # Update n's loop entry if h's outermost entry comes
- * # before n's outermost entry in current DFS path.
+ * # Update n's loop entry if h comes before n in current DFS path.
* def update_loop_entry(n, h):
- * n1 = get_loop_entry(n) or n
- * h1 = get_loop_entry(h) or h
- * if h1 in path and depths[h1] <= depths[n1]:
+ * if h in path and depths[entries.get(n, n)] < depths[n]:
 * entries[n] = h
*
* def dfs(n, depth):
@@ -1810,7 +1902,7 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
* # Case A: explore succ and update cur's loop entry
* # only if succ's entry is in current DFS path.
* dfs(succ, depth + 1)
- * h = get_loop_entry(succ)
+ * h = entries.get(succ, None)
* update_loop_entry(n, h)
* else:
* # Case B or C depending on `h1 in path` check in update_loop_entry().
@@ -1822,46 +1914,49 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta
* and cur's loop entry has to be updated (case A), handle this in
* update_branch_counts();
* - use st->branch > 0 as a signal that st is in the current DFS path;
- * - handle cases B and C in is_state_visited();
- * - update topmost loop entry for intermediate states in get_loop_entry().
+ * - handle cases B and C in is_state_visited().
*/
-static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_state *st)
+static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
{
- struct bpf_verifier_state *topmost = st->loop_entry, *old;
+ struct bpf_verifier_state *topmost = st->loop_entry;
+ u32 steps = 0;
- while (topmost && topmost->loop_entry && topmost != topmost->loop_entry)
+ while (topmost && topmost->loop_entry) {
+ if (steps++ > st->dfs_depth) {
+ WARN_ONCE(true, "verifier bug: infinite loop in get_loop_entry\n");
+ verbose(env, "verifier bug: infinite loop in get_loop_entry()\n");
+ return ERR_PTR(-EFAULT);
+ }
topmost = topmost->loop_entry;
- /* Update loop entries for intermediate states to avoid this
- * traversal in future get_loop_entry() calls.
- */
- while (st && st->loop_entry != topmost) {
- old = st->loop_entry;
- st->loop_entry = topmost;
- st = old;
}
return topmost;
}
-static void update_loop_entry(struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
+static void update_loop_entry(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr)
{
- struct bpf_verifier_state *cur1, *hdr1;
-
- cur1 = get_loop_entry(cur) ?: cur;
- hdr1 = get_loop_entry(hdr) ?: hdr;
- /* The head1->branches check decides between cases B and C in
- * comment for get_loop_entry(). If hdr1->branches == 0 then
+ /* The hdr->branches check decides between cases B and C in
+ * comment for get_loop_entry(). If hdr->branches == 0 then
* head's topmost loop entry is not in current DFS path,
* hence 'cur' and 'hdr' are not in the same loop and there is
* no need to update cur->loop_entry.
*/
- if (hdr1->branches && hdr1->dfs_depth <= cur1->dfs_depth) {
+ if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) {
+ if (cur->loop_entry) {
+ cur->loop_entry->used_as_loop_entry--;
+ maybe_free_verifier_state(env, state_loop_entry_as_list(cur));
+ }
cur->loop_entry = hdr;
- hdr->used_as_loop_entry = true;
+ hdr->used_as_loop_entry++;
}
}
static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
{
+ struct bpf_verifier_state_list *sl = NULL, *parent_sl;
+ struct bpf_verifier_state *parent;
+
while (st) {
u32 br = --st->branches;
@@ -1871,7 +1966,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
* This is a part of 'case A' in get_loop_entry() comment.
*/
if (br == 0 && st->parent && st->loop_entry)
- update_loop_entry(st->parent, st->loop_entry);
+ update_loop_entry(env, st->parent, st->loop_entry);
/* WARN_ON(br > 1) technically makes sense here,
* but see comment in push_stack(), hence:
@@ -1881,7 +1976,12 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi
br);
if (br)
break;
- st = st->parent;
+ parent = st->parent;
+ parent_sl = state_parent_as_list(st);
+ if (sl)
+ maybe_free_verifier_state(env, sl);
+ st = parent;
+ sl = parent_sl;
}
}
@@ -3206,6 +3306,21 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
return res ? &res->func_model : NULL;
}
+static int add_kfunc_in_insns(struct bpf_verifier_env *env,
+ struct bpf_insn *insn, int cnt)
+{
+ int i, ret;
+
+ for (i = 0; i < cnt; i++, insn++) {
+ if (bpf_pseudo_kfunc_call(insn)) {
+ ret = add_kfunc_call(env, insn->imm, insn->off);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprog = env->subprog_info;
@@ -3269,6 +3384,15 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return 0;
}
+static int jmp_offset(struct bpf_insn *insn)
+{
+ u8 code = insn->code;
+
+ if (code == (BPF_JMP32 | BPF_JA))
+ return insn->imm;
+ return insn->off;
+}
+
static int check_subprogs(struct bpf_verifier_env *env)
{
int i, subprog_start, subprog_end, off, cur_subprog = 0;
@@ -3295,10 +3419,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
- if (code == (BPF_JMP32 | BPF_JA))
- off = i + insn[i].imm + 1;
- else
- off = i + insn[i].off + 1;
+ off = i + jmp_offset(&insn[i]) + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
@@ -3483,7 +3604,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
if (class == BPF_STX) {
- /* BPF_STX (including atomic variants) has multiple source
+ /* BPF_STX (including atomic variants) has one or more source
* operands, one of which is a ptr. Check whether the caller is
* asking about it.
*/
@@ -3828,6 +3949,17 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
return btf_name_by_offset(desc_btf, func->name_off);
}
+static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ const struct bpf_insn_cbs cbs = {
+ .cb_call = disasm_kfunc_name,
+ .cb_print = verbose,
+ .private_data = env,
+ };
+
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+}
+
static inline void bt_init(struct backtrack_state *bt, u32 frame)
{
bt->frame = frame;
@@ -4028,11 +4160,6 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
struct bpf_insn_hist_entry *hist, struct backtrack_state *bt)
{
- const struct bpf_insn_cbs cbs = {
- .cb_call = disasm_kfunc_name,
- .cb_print = verbose,
- .private_data = env,
- };
struct bpf_insn *insn = env->prog->insnsi + idx;
u8 class = BPF_CLASS(insn->code);
u8 opcode = BPF_OP(insn->code);
@@ -4050,7 +4177,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt));
verbose(env, "stack=%s before ", env->tmp_str_buf);
verbose(env, "%d: ", idx);
- print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ verbose_insn(env, insn);
}
/* If there is a history record that some registers gained range at this insn,
@@ -4097,7 +4224,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
* dreg still needs precision before this insn
*/
}
- } else if (class == BPF_LDX) {
+ } else if (class == BPF_LDX || is_atomic_load_insn(insn)) {
if (!bt_is_reg_set(bt, dreg))
return 0;
bt_clear_reg(bt, dreg);
@@ -5982,18 +6109,10 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check access to 'struct bpf_context' fields. Supports fixed offsets only */
static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
- enum bpf_access_type t, enum bpf_reg_type *reg_type,
- struct btf **btf, u32 *btf_id, bool *is_retval, bool is_ldsx)
+ enum bpf_access_type t, struct bpf_insn_access_aux *info)
{
- struct bpf_insn_access_aux info = {
- .reg_type = *reg_type,
- .log = &env->log,
- .is_retval = false,
- .is_ldsx = is_ldsx,
- };
-
if (env->ops->is_valid_access &&
- env->ops->is_valid_access(off, size, t, env->prog, &info)) {
+ env->ops->is_valid_access(off, size, t, env->prog, info)) {
/* A non zero info.ctx_field_size indicates that this field is a
* candidate for later verifier transformation to load the whole
* field and then apply a mask when accessed with a narrower
@@ -6001,14 +6120,15 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
* will only allow for whole field access and rejects any other
* type of narrower access.
*/
- *reg_type = info.reg_type;
- *is_retval = info.is_retval;
-
- if (base_type(*reg_type) == PTR_TO_BTF_ID) {
- *btf = info.btf;
- *btf_id = info.btf_id;
+ if (base_type(info->reg_type) == PTR_TO_BTF_ID) {
+ if (info->ref_obj_id &&
+ !find_reference_state(env->cur_state, info->ref_obj_id)) {
+ verbose(env, "invalid bpf_context access off=%d. Reference may already be released\n",
+ off);
+ return -EACCES;
+ }
} else {
- env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+ env->insn_aux_data[insn_idx].ctx_field_size = info->ctx_field_size;
}
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
@@ -6118,6 +6238,26 @@ static bool is_arena_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_ARENA;
}
+/* Return false if @regno contains a pointer whose type isn't supported for
+ * atomic instruction @insn.
+ */
+static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno,
+ struct bpf_insn *insn)
+{
+ if (is_ctx_reg(env, regno))
+ return false;
+ if (is_pkt_reg(env, regno))
+ return false;
+ if (is_flow_key_reg(env, regno))
+ return false;
+ if (is_sk_reg(env, regno))
+ return false;
+ if (is_arena_reg(env, regno))
+ return bpf_jit_supports_insn(insn, true);
+
+ return true;
+}
+
static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#ifdef CONFIG_NET
[PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
@@ -7365,11 +7505,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
- bool is_retval = false;
struct bpf_retval_range range;
- enum bpf_reg_type reg_type = SCALAR_VALUE;
- struct btf *btf = NULL;
- u32 btf_id = 0;
+ struct bpf_insn_access_aux info = {
+ .reg_type = SCALAR_VALUE,
+ .is_ldsx = is_ldsx,
+ .log = &env->log,
+ };
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
@@ -7381,8 +7522,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err < 0)
return err;
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
- &btf_id, &is_retval, is_ldsx);
+ err = check_ctx_access(env, insn_idx, off, size, t, &info);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -7390,8 +7530,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* PTR_TO_PACKET[_META,_END]. In the latter
* case, we know the offset is zero.
*/
- if (reg_type == SCALAR_VALUE) {
- if (is_retval && get_func_retval_range(env->prog, &range)) {
+ if (info.reg_type == SCALAR_VALUE) {
+ if (info.is_retval && get_func_retval_range(env->prog, &range)) {
err = __mark_reg_s32_range(env, regs, value_regno,
range.minval, range.maxval);
if (err)
@@ -7402,7 +7542,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
mark_reg_known_zero(env, regs,
value_regno);
- if (type_may_be_null(reg_type))
+ if (type_may_be_null(info.reg_type))
regs[value_regno].id = ++env->id_gen;
/* A load of ctx field could have different
* actual load size with the one encoded in the
@@ -7410,12 +7550,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (base_type(reg_type) == PTR_TO_BTF_ID) {
- regs[value_regno].btf = btf;
- regs[value_regno].btf_id = btf_id;
+ if (base_type(info.reg_type) == PTR_TO_BTF_ID) {
+ regs[value_regno].btf = info.btf;
+ regs[value_regno].btf_id = info.btf_id;
+ regs[value_regno].ref_obj_id = info.ref_obj_id;
}
}
- regs[value_regno].type = reg_type;
+ regs[value_regno].type = info.reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
@@ -7518,27 +7659,72 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
bool allow_trust_mismatch);
-static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
+static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ bool strict_alignment_once, bool is_ldsx,
+ bool allow_trust_mismatch, const char *ctx)
{
- int load_reg;
+ struct bpf_reg_state *regs = cur_regs(env);
+ enum bpf_reg_type src_reg_type;
int err;
- switch (insn->imm) {
- case BPF_ADD:
- case BPF_ADD | BPF_FETCH:
- case BPF_AND:
- case BPF_AND | BPF_FETCH:
- case BPF_OR:
- case BPF_OR | BPF_FETCH:
- case BPF_XOR:
- case BPF_XOR | BPF_FETCH:
- case BPF_XCHG:
- case BPF_CMPXCHG:
- break;
- default:
- verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm);
- return -EINVAL;
- }
+ /* check src operand */
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check dst operand */
+ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ src_reg_type = regs[insn->src_reg].type;
+
+ /* Check if (src_reg + off) is readable. The state of dst_reg will be
+ * updated by this call.
+ */
+ err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, insn->dst_reg,
+ strict_alignment_once, is_ldsx);
+ err = err ?: save_aux_ptr_type(env, src_reg_type,
+ allow_trust_mismatch);
+ err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], ctx);
+
+ return err;
+}
+
+static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ bool strict_alignment_once)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ enum bpf_reg_type dst_reg_type;
+ int err;
+
+ /* check src1 operand */
+ err = check_reg_arg(env, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check src2 operand */
+ err = check_reg_arg(env, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg_type = regs[insn->dst_reg].type;
+
+ /* Check if (dst_reg + off) is writeable. */
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg,
+ strict_alignment_once, false);
+ err = err ?: save_aux_ptr_type(env, dst_reg_type, false);
+
+ return err;
+}
+
+static int check_atomic_rmw(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int load_reg;
+ int err;
if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) {
verbose(env, "invalid atomic operand size\n");
@@ -7574,11 +7760,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
return -EACCES;
}
- if (is_ctx_reg(env, insn->dst_reg) ||
- is_pkt_reg(env, insn->dst_reg) ||
- is_flow_key_reg(env, insn->dst_reg) ||
- is_sk_reg(env, insn->dst_reg) ||
- (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) {
+ if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
reg_type_str(env, reg_state(env, insn->dst_reg)->type));
@@ -7605,12 +7787,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
/* Check whether we can read the memory, with second call for fetch
* case to simulate the register fill.
*/
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1, true, false);
if (!err && load_reg >= 0)
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, load_reg,
- true, false);
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_READ, load_reg, true, false);
if (err)
return err;
@@ -7620,13 +7802,86 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
return err;
}
/* Check whether we can write into the same memory. */
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
if (err)
return err;
return 0;
}
+static int check_atomic_load(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int err;
+
+ err = check_load_mem(env, insn, true, false, false, "atomic_load");
+ if (err)
+ return err;
+
+ if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) {
+ verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n",
+ insn->src_reg,
+ reg_type_str(env, reg_state(env, insn->src_reg)->type));
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_atomic_store(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ int err;
+
+ err = check_store_reg(env, insn, true);
+ if (err)
+ return err;
+
+ if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) {
+ verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
+ insn->dst_reg,
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ switch (insn->imm) {
+ case BPF_ADD:
+ case BPF_ADD | BPF_FETCH:
+ case BPF_AND:
+ case BPF_AND | BPF_FETCH:
+ case BPF_OR:
+ case BPF_OR | BPF_FETCH:
+ case BPF_XOR:
+ case BPF_XOR | BPF_FETCH:
+ case BPF_XCHG:
+ case BPF_CMPXCHG:
+ return check_atomic_rmw(env, insn);
+ case BPF_LOAD_ACQ:
+ if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
+ verbose(env,
+ "64-bit load-acquires are only supported on 64-bit arches\n");
+ return -EOPNOTSUPP;
+ }
+ return check_atomic_load(env, insn);
+ case BPF_STORE_REL:
+ if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) {
+ verbose(env,
+ "64-bit store-releases are only supported on 64-bit arches\n");
+ return -EOPNOTSUPP;
+ }
+ return check_atomic_store(env, insn);
+ default:
+ verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n",
+ insn->imm);
+ return -EINVAL;
+ }
+}
+
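For reference, the instructions check_atomic() now dispatches on use the BPF_STX | BPF_ATOMIC encoding with BPF_LOAD_ACQ or BPF_STORE_REL in insn->imm, which is also what is_atomic_load_insn() earlier in this diff matches. A hedged, hand-encoded example of a 32-bit load-acquire, assuming uapi headers that already define BPF_LOAD_ACQ:

#include <linux/bpf.h>	/* struct bpf_insn, BPF_STX, BPF_W, BPF_ATOMIC */

/* Illustrative only: "r1 = load_acquire(*(u32 *)(r2 + 0))".
 * The class is BPF_STX even though this is a load; the verifier routes
 * it to check_atomic_load() purely based on insn->imm.
 */
static const struct bpf_insn load_acq_w = {
	.code    = BPF_STX | BPF_W | BPF_ATOMIC,
	.dst_reg = BPF_REG_1,	/* destination register */
	.src_reg = BPF_REG_2,	/* pointer operand, vetted by atomic_ptr_type_ok() */
	.off     = 0,
	.imm     = BPF_LOAD_ACQ,
};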
/* When register 'regno' is used to read the stack (either directly or through
* a helper function) make sure that it's within stack boundary and, depending
* on the access type and privileges, that all elements of the stack are
@@ -7985,6 +8240,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
return err;
}
+enum {
+ PROCESS_SPIN_LOCK = (1 << 0),
+ PROCESS_RES_LOCK = (1 << 1),
+ PROCESS_LOCK_IRQ = (1 << 2),
+};
+
/* Implementation details:
* bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
* bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
@@ -8007,30 +8268,33 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
* env->cur_state->active_locks remembers which map value element or allocated
* object got locked and clears it after bpf_spin_unlock.
*/
-static int process_spin_lock(struct bpf_verifier_env *env, int regno,
- bool is_lock)
+static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags)
{
+ bool is_lock = flags & PROCESS_SPIN_LOCK, is_res_lock = flags & PROCESS_RES_LOCK;
+ const char *lock_str = is_res_lock ? "bpf_res_spin" : "bpf_spin";
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
+ bool is_irq = flags & PROCESS_LOCK_IRQ;
u64 val = reg->var_off.value;
struct bpf_map *map = NULL;
struct btf *btf = NULL;
struct btf_record *rec;
+ u32 spin_lock_off;
int err;
if (!is_const) {
verbose(env,
- "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
- regno);
+ "R%d doesn't have constant offset. %s_lock has to be at the constant offset\n",
+ regno, lock_str);
return -EINVAL;
}
if (reg->type == PTR_TO_MAP_VALUE) {
map = reg->map_ptr;
if (!map->btf) {
verbose(env,
- "map '%s' has to have BTF in order to use bpf_spin_lock\n",
- map->name);
+ "map '%s' has to have BTF in order to use %s_lock\n",
+ map->name, lock_str);
return -EINVAL;
}
} else {
@@ -8038,36 +8302,53 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
}
rec = reg_btf_record(reg);
- if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) {
- verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local",
- map ? map->name : "kptr");
+ if (!btf_record_has_field(rec, is_res_lock ? BPF_RES_SPIN_LOCK : BPF_SPIN_LOCK)) {
+ verbose(env, "%s '%s' has no valid %s_lock\n", map ? "map" : "local",
+ map ? map->name : "kptr", lock_str);
return -EINVAL;
}
- if (rec->spin_lock_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n",
- val + reg->off, rec->spin_lock_off);
+ spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off;
+ if (spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n",
+ val + reg->off, lock_str, spin_lock_off);
return -EINVAL;
}
if (is_lock) {
void *ptr;
+ int type;
if (map)
ptr = map;
else
ptr = btf;
- if (cur->active_locks) {
- verbose(env,
- "Locking two bpf_spin_locks are not allowed\n");
- return -EINVAL;
+ if (!is_res_lock && cur->active_locks) {
+ if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) {
+ verbose(env,
+ "Locking two bpf_spin_locks are not allowed\n");
+ return -EINVAL;
+ }
+ } else if (is_res_lock && cur->active_locks) {
+ if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) {
+ verbose(env, "Acquiring the same lock again, AA deadlock detected\n");
+ return -EINVAL;
+ }
}
- err = acquire_lock_state(env, env->insn_idx, REF_TYPE_LOCK, reg->id, ptr);
+
+ if (is_res_lock && is_irq)
+ type = REF_TYPE_RES_LOCK_IRQ;
+ else if (is_res_lock)
+ type = REF_TYPE_RES_LOCK;
+ else
+ type = REF_TYPE_LOCK;
+ err = acquire_lock_state(env, env->insn_idx, type, reg->id, ptr);
if (err < 0) {
verbose(env, "Failed to acquire lock state\n");
return err;
}
} else {
void *ptr;
+ int type;
if (map)
ptr = map;
@@ -8075,12 +8356,26 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
ptr = btf;
if (!cur->active_locks) {
- verbose(env, "bpf_spin_unlock without taking a lock\n");
+ verbose(env, "%s_unlock without taking a lock\n", lock_str);
return -EINVAL;
}
- if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) {
- verbose(env, "bpf_spin_unlock of different lock\n");
+ if (is_res_lock && is_irq)
+ type = REF_TYPE_RES_LOCK_IRQ;
+ else if (is_res_lock)
+ type = REF_TYPE_RES_LOCK;
+ else
+ type = REF_TYPE_LOCK;
+ if (!find_lock_state(cur, type, reg->id, ptr)) {
+ verbose(env, "%s_unlock of different lock\n", lock_str);
+ return -EINVAL;
+ }
+ if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) {
+ verbose(env, "%s_unlock cannot be out of order\n", lock_str);
+ return -EINVAL;
+ }
+ if (release_lock_state(cur, type, reg->id, ptr)) {
+ verbose(env, "%s_unlock of different lock\n", lock_str);
return -EINVAL;
}
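Taken together, the lock handling above enforces two rules that BPF authors see directly: the resilient acquire kfuncs can fail and their return value must be checked (the verifier forks a failure branch later in this diff), and locks must be released in reverse order of acquisition, which is what active_lock_id/active_lock_ptr track. A hedged BPF-side sketch, assuming the usual __ksym declarations and a hypothetical struct node that embeds a struct bpf_res_spin_lock:

/* Illustrative only; kfunc prototypes follow the kernel side of this
 * series and may differ slightly in a given tree.
 */
extern int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) __ksym;
extern void bpf_res_spin_unlock(struct bpf_res_spin_lock *lock) __ksym;

static int lock_both(struct node *a, struct node *b)	/* struct node is hypothetical */
{
	if (bpf_res_spin_lock(&a->lock))	/* may fail, e.g. on detected deadlock */
		return 0;
	if (bpf_res_spin_lock(&b->lock)) {
		bpf_res_spin_unlock(&a->lock);
		return 0;
	}

	/* ... critical section ... */

	bpf_res_spin_unlock(&b->lock);	/* must come before a->lock, else */
	bpf_res_spin_unlock(&a->lock);	/* "_unlock cannot be out of order" */
	return 0;
}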
@@ -8431,10 +8726,12 @@ static struct bpf_verifier_state *find_prev_entry(struct bpf_verifier_env *env,
{
struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *st;
+ struct list_head *pos, *head;
/* Explored states are pushed in stack order, most recent states come first */
- sl = *explored_state(env, insn_idx);
- for (; sl; sl = sl->next) {
+ head = explored_state(env, insn_idx);
+ list_for_each(pos, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
/* If st->branches != 0 state is a part of current DFS verification path,
* hence cur & st for a loop.
*/
@@ -9404,11 +9701,11 @@ skip_type_check:
return -EACCES;
}
if (meta->func_id == BPF_FUNC_spin_lock) {
- err = process_spin_lock(env, regno, true);
+ err = process_spin_lock(env, regno, PROCESS_SPIN_LOCK);
if (err)
return err;
} else if (meta->func_id == BPF_FUNC_spin_unlock) {
- err = process_spin_lock(env, regno, false);
+ err = process_spin_lock(env, regno, 0);
if (err)
return err;
} else {
@@ -9666,7 +9963,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
goto error;
if (env->subprog_cnt > 1 && !allow_tail_call_in_subprogs(env)) {
- verbose(env, "tail_calls are not allowed in non-JITed programs with bpf-to-bpf calls\n");
+ verbose(env, "mixing of tail_calls and bpf-to-bpf calls is not supported\n");
return -EINVAL;
}
break;
@@ -10237,23 +10534,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (subprog_is_global(env, subprog)) {
const char *sub_name = subprog_name(env, subprog);
- /* Only global subprogs cannot be called with a lock held. */
if (env->cur_state->active_locks) {
verbose(env, "global function calls are not allowed while holding a lock,\n"
"use static function instead\n");
return -EINVAL;
}
- /* Only global subprogs cannot be called with preemption disabled. */
- if (env->cur_state->active_preempt_locks) {
- verbose(env, "global function calls are not allowed with preemption disabled,\n"
- "use static function instead\n");
- return -EINVAL;
- }
-
- if (env->cur_state->active_irq_id) {
- verbose(env, "global function calls are not allowed with IRQs disabled,\n"
- "use static function instead\n");
+ if (env->subprog_info[subprog].might_sleep &&
+ (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks ||
+ env->cur_state->active_irq_id || !in_sleepable(env))) {
+ verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n"
+ "i.e., in a RCU/IRQ/preempt-disabled section, or in\n"
+ "a non-sleepable BPF program context\n");
return -EINVAL;
}
@@ -10752,6 +11044,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
struct bpf_verifier_state *state = env->cur_state;
+ enum bpf_prog_type type = resolve_prog_type(env->prog);
+ struct bpf_reg_state *reg = reg_state(env, BPF_REG_0);
bool refs_lingering = false;
int i;
@@ -10761,6 +11055,12 @@ static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exi
for (i = 0; i < state->acquired_refs; i++) {
if (state->refs[i].type != REF_TYPE_PTR)
continue;
+ /* Allow struct_ops programs to return a referenced kptr back to
+ * kernel. Type checks are performed later in check_return_code.
+ */
+ if (type == BPF_PROG_TYPE_STRUCT_OPS && !exception_exit &&
+ reg->ref_obj_id == state->refs[i].id)
+ continue;
verbose(env, "Unreleased reference id=%d alloc_insn=%d\n",
state->refs[i].id, state->refs[i].insn_idx);
refs_lingering = true;
@@ -11287,7 +11587,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].map_uid = meta.map_uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
if (!type_may_be_null(ret_flag) &&
- btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
+ btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
break;
@@ -11459,10 +11759,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* mark_btf_func_reg_size() is used when the reg size is determined by
* the BTF func_proto's return value size and argument.
*/
-static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
- size_t reg_size)
+static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_reg_state *regs,
+ u32 regno, size_t reg_size)
{
- struct bpf_reg_state *reg = &cur_regs(env)[regno];
+ struct bpf_reg_state *reg = &regs[regno];
if (regno == BPF_REG_0) {
/* Function return value */
@@ -11480,6 +11780,12 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
+static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
+ size_t reg_size)
+{
+ return __mark_btf_func_reg_size(env, cur_regs(env), regno, reg_size);
+}
+
static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->kfunc_flags & KF_ACQUIRE;
@@ -11617,6 +11923,7 @@ enum {
KF_ARG_RB_ROOT_ID,
KF_ARG_RB_NODE_ID,
KF_ARG_WORKQUEUE_ID,
+ KF_ARG_RES_SPIN_LOCK_ID,
};
BTF_ID_LIST(kf_arg_btf_ids)
@@ -11626,6 +11933,7 @@ BTF_ID(struct, bpf_list_node)
BTF_ID(struct, bpf_rb_root)
BTF_ID(struct, bpf_rb_node)
BTF_ID(struct, bpf_wq)
+BTF_ID(struct, bpf_res_spin_lock)
static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
const struct btf_param *arg, int type)
@@ -11674,6 +11982,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg)
return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID);
}
+static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID);
+}
+
static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf,
const struct btf_param *arg)
{
@@ -11745,6 +12058,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_MAP,
KF_ARG_PTR_TO_WORKQUEUE,
KF_ARG_PTR_TO_IRQ_FLAG,
+ KF_ARG_PTR_TO_RES_SPIN_LOCK,
};
enum special_kfunc_type {
@@ -11781,6 +12095,12 @@ enum special_kfunc_type {
KF_bpf_iter_num_new,
KF_bpf_iter_num_next,
KF_bpf_iter_num_destroy,
+ KF_bpf_set_dentry_xattr,
+ KF_bpf_remove_dentry_xattr,
+ KF_bpf_res_spin_lock,
+ KF_bpf_res_spin_unlock,
+ KF_bpf_res_spin_lock_irqsave,
+ KF_bpf_res_spin_unlock_irqrestore,
};
BTF_SET_START(special_kfunc_set)
@@ -11810,6 +12130,10 @@ BTF_ID(func, bpf_wq_set_callback_impl)
#ifdef CONFIG_CGROUPS
BTF_ID(func, bpf_iter_css_task_new)
#endif
+#ifdef CONFIG_BPF_LSM
+BTF_ID(func, bpf_set_dentry_xattr)
+BTF_ID(func, bpf_remove_dentry_xattr)
+#endif
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -11859,6 +12183,17 @@ BTF_ID(func, bpf_local_irq_restore)
BTF_ID(func, bpf_iter_num_new)
BTF_ID(func, bpf_iter_num_next)
BTF_ID(func, bpf_iter_num_destroy)
+#ifdef CONFIG_BPF_LSM
+BTF_ID(func, bpf_set_dentry_xattr)
+BTF_ID(func, bpf_remove_dentry_xattr)
+#else
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+#endif
+BTF_ID(func, bpf_res_spin_lock)
+BTF_ID(func, bpf_res_spin_unlock)
+BTF_ID(func, bpf_res_spin_lock_irqsave)
+BTF_ID(func, bpf_res_spin_unlock_irqrestore)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -11952,6 +12287,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_IRQ_FLAG;
+ if (is_kfunc_arg_res_spin_lock(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_RES_SPIN_LOCK;
+
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -12059,13 +12397,19 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno,
struct bpf_kfunc_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ int err, kfunc_class = IRQ_NATIVE_KFUNC;
bool irq_save;
- int err;
- if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) {
+ if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]) {
irq_save = true;
- } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) {
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])
+ kfunc_class = IRQ_LOCK_KFUNC;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) {
irq_save = false;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
+ kfunc_class = IRQ_LOCK_KFUNC;
} else {
verbose(env, "verifier internal error: unknown irq flags kfunc\n");
return -EFAULT;
@@ -12081,7 +12425,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno,
if (err)
return err;
- err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx);
+ err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx, kfunc_class);
if (err)
return err;
} else {
@@ -12095,7 +12439,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno,
if (err)
return err;
- err = unmark_stack_slot_irq_flag(env, reg);
+ err = unmark_stack_slot_irq_flag(env, reg, kfunc_class);
if (err)
return err;
}
@@ -12222,7 +12566,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
if (!env->cur_state->active_locks)
return -EINVAL;
- s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr);
+ s = find_lock_state(env->cur_state, REF_TYPE_LOCK_MASK, id, ptr);
if (!s) {
verbose(env, "held lock and object are not in the same allocation\n");
return -EINVAL;
@@ -12258,9 +12602,18 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}
+static bool is_bpf_res_spin_lock_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_unlock] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
+ btf_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore];
+}
+
static bool kfunc_spin_allowed(u32 btf_id)
{
- return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id);
+ return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id) ||
+ is_bpf_res_spin_lock_kfunc(btf_id);
}
static bool is_sync_callback_calling_kfunc(u32 btf_id)
@@ -12692,6 +13045,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_CONST_STR:
case KF_ARG_PTR_TO_WORKQUEUE:
case KF_ARG_PTR_TO_IRQ_FLAG:
+ case KF_ARG_PTR_TO_RES_SPIN_LOCK:
break;
default:
WARN_ON_ONCE(1);
@@ -12990,6 +13344,28 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret < 0)
return ret;
break;
+ case KF_ARG_PTR_TO_RES_SPIN_LOCK:
+ {
+ int flags = PROCESS_RES_LOCK;
+
+ if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d doesn't point to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+
+ if (!is_bpf_res_spin_lock_kfunc(meta->func_id))
+ return -EFAULT;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])
+ flags |= PROCESS_SPIN_LOCK;
+ if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave] ||
+ meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore])
+ flags |= PROCESS_LOCK_IRQ;
+ ret = process_spin_lock(env, regno, flags);
+ if (ret < 0)
+ return ret;
+ break;
+ }
}
}
@@ -13075,6 +13451,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+ if (!insn->off &&
+ (insn->imm == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ insn->imm == special_kfunc_list[KF_bpf_res_spin_lock_irqsave])) {
+ struct bpf_verifier_state *branch;
+ struct bpf_reg_state *regs;
+
+ branch = push_stack(env, env->insn_idx + 1, env->insn_idx, false);
+ if (!branch) {
+ verbose(env, "failed to push state for failed lock acquisition\n");
+ return -ENOMEM;
+ }
+
+ regs = branch->frame[branch->curframe]->regs;
+
+ /* Clear r0-r5 registers in forked state */
+ for (i = 0; i < CALLER_SAVED_REGS; i++)
+ mark_reg_not_init(env, regs, caller_saved[i]);
+
+ mark_reg_unknown(env, regs, BPF_REG_0);
+ err = __mark_reg_s32_range(env, regs, BPF_REG_0, -MAX_ERRNO, -1);
+ if (err) {
+ verbose(env, "failed to mark s32 range for retval in forked state for lock\n");
+ return err;
+ }
+ __mark_btf_func_reg_size(env, regs, BPF_REG_0, sizeof(u32));
+ }
+
if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
return -EACCES;
@@ -13245,6 +13648,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
+ if (meta.btf == btf_vmlinux && (meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock] ||
+ meta.func_id == special_kfunc_list[KF_bpf_res_spin_lock_irqsave]))
+ __mark_reg_const_zero(env, &regs[BPF_REG_0]);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
@@ -16399,13 +16805,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
const char *exit_ctx = "At program exit";
struct tnum enforce_attach_type_range = tnum_unknown;
const struct bpf_prog *prog = env->prog;
- struct bpf_reg_state *reg;
+ struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_retval_range range = retval_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
struct bpf_func_state *frame = env->cur_state->frame[0];
const bool is_subprog = frame->subprogno;
bool return_32bit = false;
+ const struct btf_type *reg_type, *ret_type = NULL;
/* LSM and struct_ops func-ptr's return type could be "void" */
if (!is_subprog || frame->in_exception_callback_fn) {
@@ -16414,10 +16821,26 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
if (prog->expected_attach_type == BPF_LSM_CGROUP)
/* See below, can be 0 or 0-1 depending on hook. */
break;
- fallthrough;
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+ break;
case BPF_PROG_TYPE_STRUCT_OPS:
if (!prog->aux->attach_func_proto->type)
return 0;
+
+ if (frame->in_exception_callback_fn)
+ break;
+
+ /* Allow a struct_ops program to return a referenced kptr if it
+ * matches the operator's return type and is in its unmodified
+ * form. A scalar zero (i.e., a null pointer) is also allowed.
+ */
+ reg_type = reg->btf ? btf_type_by_id(reg->btf, reg->btf_id) : NULL;
+ ret_type = btf_type_resolve_ptr(prog->aux->attach_btf,
+ prog->aux->attach_func_proto->type,
+ NULL);
+ if (ret_type && ret_type == reg_type && reg->ref_obj_id)
+ return __check_ptr_off_reg(env, reg, regno, false);
break;
default:
break;
@@ -16439,8 +16862,6 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
return -EACCES;
}
- reg = cur_regs(env) + regno;
-
if (frame->in_async_callback_fn) {
/* enforce return zero from async callbacks like timer */
exit_ctx = "At async callback return";
@@ -16539,6 +16960,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
case BPF_PROG_TYPE_NETFILTER:
range = retval_range(NF_DROP, NF_ACCEPT);
break;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!ret_type)
+ return 0;
+ range = retval_range(0, 0);
+ break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
@@ -16582,6 +17008,14 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off)
subprog->changes_pkt_data = true;
}
+static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off)
+{
+ struct bpf_subprog_info *subprog;
+
+ subprog = find_containing_subprog(env, off);
+ subprog->might_sleep = true;
+}
+
/* 't' is an index of a call-site.
* 'w' is a callee entry point.
* Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED.
@@ -16595,6 +17029,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w)
caller = find_containing_subprog(env, t);
callee = find_containing_subprog(env, w);
caller->changes_pkt_data |= callee->changes_pkt_data;
+ caller->might_sleep |= callee->might_sleep;
}
/* non-recursive DFS pseudo code
@@ -16753,27 +17188,6 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
/* Bitmask with 1s for all caller saved registers */
#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
-/* Return a bitmask specifying which caller saved registers are
- * clobbered by a call to a helper *as if* this helper follows
- * bpf_fastcall contract:
- * - includes R0 if function is non-void;
- * - includes R1-R5 if corresponding parameter has is described
- * in the function prototype.
- */
-static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn)
-{
- u32 mask;
- int i;
-
- mask = 0;
- if (fn->ret_type != RET_VOID)
- mask |= BIT(BPF_REG_0);
- for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i)
- if (fn->arg_type[i] != ARG_DONTCARE)
- mask |= BIT(BPF_REG_1 + i);
- return mask;
-}
-
/* True if do_misc_fixups() replaces calls to helper number 'imm',
* replacement patch is presumed to follow bpf_fastcall contract
* (see mark_fastcall_pattern_for_call() below).
@@ -16790,24 +17204,54 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
}
}
-/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */
-static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta)
+struct call_summary {
+ u8 num_params;
+ bool is_void;
+ bool fastcall;
+};
+
+/* If @call is a kfunc or helper call, fills @cs and returns true,
+ * otherwise returns false.
+ */
+static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call,
+ struct call_summary *cs)
{
- u32 vlen, i, mask;
+ struct bpf_kfunc_call_arg_meta meta;
+ const struct bpf_func_proto *fn;
+ int i;
- vlen = btf_type_vlen(meta->func_proto);
- mask = 0;
- if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type)))
- mask |= BIT(BPF_REG_0);
- for (i = 0; i < vlen; ++i)
- mask |= BIT(BPF_REG_1 + i);
- return mask;
-}
+ if (bpf_helper_call(call)) {
-/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */
-static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta)
-{
- return meta->kfunc_flags & KF_FASTCALL;
+ if (get_helper_proto(env, call->imm, &fn) < 0)
+ /* error would be reported later */
+ return false;
+ cs->fastcall = fn->allow_fastcall &&
+ (verifier_inlines_helper_call(env, call->imm) ||
+ bpf_jit_inlines_helper_call(call->imm));
+ cs->is_void = fn->ret_type == RET_VOID;
+ cs->num_params = 0;
+ for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) {
+ if (fn->arg_type[i] == ARG_DONTCARE)
+ break;
+ cs->num_params++;
+ }
+ return true;
+ }
+
+ if (bpf_pseudo_kfunc_call(call)) {
+ int err;
+
+ err = fetch_kfunc_meta(env, call, &meta, NULL);
+ if (err < 0)
+ /* error would be reported later */
+ return false;
+ cs->num_params = btf_type_vlen(meta.func_proto);
+ cs->fastcall = meta.kfunc_flags & KF_FASTCALL;
+ cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type));
+ return true;
+ }
+
+ return false;
}
/* LLVM define a bpf_fastcall function attribute.
@@ -16890,39 +17334,23 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
{
struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx;
struct bpf_insn *call = &env->prog->insnsi[insn_idx];
- const struct bpf_func_proto *fn;
- u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS;
+ u32 clobbered_regs_mask;
+ struct call_summary cs;
u32 expected_regs_mask;
- bool can_be_inlined = false;
s16 off;
int i;
- if (bpf_helper_call(call)) {
- if (get_helper_proto(env, call->imm, &fn) < 0)
- /* error would be reported later */
- return;
- clobbered_regs_mask = helper_fastcall_clobber_mask(fn);
- can_be_inlined = fn->allow_fastcall &&
- (verifier_inlines_helper_call(env, call->imm) ||
- bpf_jit_inlines_helper_call(call->imm));
- }
-
- if (bpf_pseudo_kfunc_call(call)) {
- struct bpf_kfunc_call_arg_meta meta;
- int err;
-
- err = fetch_kfunc_meta(env, call, &meta, NULL);
- if (err < 0)
- /* error would be reported later */
- return;
-
- clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta);
- can_be_inlined = is_fastcall_kfunc_call(&meta);
- }
-
- if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS)
+ if (!get_call_summary(env, call, &cs))
return;
+ /* A bitmask specifying which caller saved registers are clobbered
+ * by a call to a helper/kfunc *as if* this helper/kfunc follows
+ * bpf_fastcall contract:
+ * - includes R0 if function is non-void;
+ * - includes R1-R5 if the corresponding parameter is described
+ * in the function prototype.
+ */
+ clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0);
/* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */
expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS;
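A worked illustration of the clobber mask computed above (standalone sketch; GENMASK() is redefined locally so it builds outside the kernel tree):

#include <stdio.h>

#define GENMASK(h, l)	(((~0u) << (l)) & (~0u >> (31 - (h))))

int main(void)
{
	/* non-void call with 2 params: R0, R1, R2 clobbered */
	printf("%#x\n", GENMASK(2, 0));		/* 0x7 */
	/* void call with 3 params: R1, R2, R3 clobbered */
	printf("%#x\n", GENMASK(3, 1));		/* 0xe */
	return 0;
}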
@@ -16980,7 +17408,7 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
* don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills()
* does not remove spill/fill pair {4,6}.
*/
- if (can_be_inlined)
+ if (cs.fastcall)
env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1;
else
subprog->keep_fastcall_stack = 1;
@@ -17062,9 +17490,20 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
mark_prune_point(env, t);
mark_jmp_point(env, t);
}
- if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm))
- mark_subprog_changes_pkt_data(env, t);
- if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ if (bpf_helper_call(insn)) {
+ const struct bpf_func_proto *fp;
+
+ ret = get_helper_proto(env, insn->imm, &fp);
+ /* If called in a non-sleepable context program will be
+ * rejected anyway, so we should end up with precise
+ * sleepable marks on subprogs, except for dead code
+ * elimination.
+ */
+ if (ret == 0 && fp->might_sleep)
+ mark_subprog_might_sleep(env, t);
+ if (bpf_helper_changes_pkt_data(insn->imm))
+ mark_subprog_changes_pkt_data(env, t);
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
ret = fetch_kfunc_meta(env, insn, &meta, NULL);
@@ -17083,6 +17522,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
*/
mark_force_checkpoint(env, t);
}
+ /* Same as for helpers: if called in a non-sleepable context,
+ * the program will be rejected anyway, so we should end up
+ * with precise sleepable marks on subprogs, except for
+ * dead code elimination.
+ */
+ if (ret == 0 && is_kfunc_sleepable(&meta))
+ mark_subprog_might_sleep(env, t);
}
return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
@@ -17125,9 +17571,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
static int check_cfg(struct bpf_verifier_env *env)
{
int insn_cnt = env->prog->len;
- int *insn_stack, *insn_state;
+ int *insn_stack, *insn_state, *insn_postorder;
int ex_insn_beg, i, ret = 0;
- bool ex_done = false;
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
@@ -17139,6 +17584,17 @@ static int check_cfg(struct bpf_verifier_env *env)
return -ENOMEM;
}
+ insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_postorder) {
+ kvfree(insn_state);
+ kvfree(insn_stack);
+ return -ENOMEM;
+ }
+
+ ex_insn_beg = env->exception_callback_subprog
+ ? env->subprog_info[env->exception_callback_subprog].start
+ : 0;
+
insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
insn_stack[0] = 0; /* 0 is the first instruction */
env->cfg.cur_stack = 1;
@@ -17152,6 +17608,7 @@ walk_cfg:
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
env->cfg.cur_stack--;
+ insn_postorder[env->cfg.cur_postorder++] = t;
break;
case KEEP_EXPLORING:
break;
@@ -17170,13 +17627,10 @@ walk_cfg:
goto err_free;
}
- if (env->exception_callback_subprog && !ex_done) {
- ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start;
-
+ if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) {
insn_state[ex_insn_beg] = DISCOVERED;
insn_stack[0] = ex_insn_beg;
env->cfg.cur_stack = 1;
- ex_done = true;
goto walk_cfg;
}
@@ -17199,6 +17653,7 @@ walk_cfg:
}
ret = 0; /* cfg looks good */
env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data;
+ env->prog->aux->might_sleep = env->subprog_info[0].might_sleep;
err_free:
kvfree(insn_state);
@@ -17815,18 +18270,22 @@ static void clean_verifier_state(struct bpf_verifier_env *env,
static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state *cur)
{
+ struct bpf_verifier_state *loop_entry;
struct bpf_verifier_state_list *sl;
+ struct list_head *pos, *head;
- sl = *explored_state(env, insn);
- while (sl) {
+ head = explored_state(env, insn);
+ list_for_each(pos, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
if (sl->state.branches)
- goto next;
+ continue;
+ loop_entry = get_loop_entry(env, &sl->state);
+ if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches)
+ continue;
if (sl->state.insn_idx != insn ||
!same_callsites(&sl->state, cur))
- goto next;
+ continue;
clean_verifier_state(env, &sl->state);
-next:
- sl = sl->next;
}
}
@@ -18127,7 +18586,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
case STACK_IRQ_FLAG:
old_reg = &old->stack[spi].spilled_ptr;
cur_reg = &cur->stack[spi].spilled_ptr;
- if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap) ||
+ old_reg->irq.kfunc_class != cur_reg->irq.kfunc_class)
return false;
break;
case STACK_MISC:
@@ -18162,6 +18622,10 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c
if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
return false;
+ if (!check_ids(old->active_lock_id, cur->active_lock_id, idmap) ||
+ old->active_lock_ptr != cur->active_lock_ptr)
+ return false;
+
for (i = 0; i < old->acquired_refs; i++) {
if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
old->refs[i].type != cur->refs[i].type)
@@ -18171,6 +18635,8 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c
case REF_TYPE_IRQ:
break;
case REF_TYPE_LOCK:
+ case REF_TYPE_RES_LOCK:
+ case REF_TYPE_RES_LOCK_IRQ:
if (old->refs[i].ptr != cur->refs[i].ptr)
return false;
break;
@@ -18210,15 +18676,17 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c
* the current state will reach 'bpf_exit' instruction safely
*/
static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old,
- struct bpf_func_state *cur, enum exact_level exact)
+ struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact)
{
- int i;
+ u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before;
+ u16 i;
if (old->callback_depth > cur->callback_depth)
return false;
for (i = 0; i < MAX_BPF_REG; i++)
- if (!regsafe(env, &old->regs[i], &cur->regs[i],
+ if (((1 << i) & live_regs) &&
+ !regsafe(env, &old->regs[i], &cur->regs[i],
&env->idmap_scratch, exact))
return false;
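
To make the effect of the new live_regs_before filter concrete (the numbers are hypothetical): if the mask recorded at the comparison point is 0x0006, only r1 and r2 are run through regsafe(), so a mismatch in a register that is provably dead at that instruction, say a leftover scalar in r3, no longer blocks two states from being judged equivalent. That is what turns the liveness analysis added later in this patch into extra pruning rather than extra bookkeeping.
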
@@ -18239,6 +18707,7 @@ static bool states_equal(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur,
enum exact_level exact)
{
+ u32 insn_idx;
int i;
if (old->curframe != cur->curframe)
@@ -18262,9 +18731,12 @@ static bool states_equal(struct bpf_verifier_env *env,
* and all frame states need to be equivalent
*/
for (i = 0; i <= old->curframe; i++) {
+ insn_idx = i == old->curframe
+ ? env->insn_idx
+ : old->frame[i + 1]->callsite;
if (old->frame[i]->callsite != cur->frame[i]->callsite)
return false;
- if (!func_states_equal(env, old->frame[i], cur->frame[i], exact))
+ if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact))
return false;
}
return true;
@@ -18517,10 +18989,11 @@ static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
- struct bpf_verifier_state_list *sl, **pprev;
+ struct bpf_verifier_state_list *sl;
struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry;
int i, j, n, err, states_cnt = 0;
bool force_new_state, add_new_state, force_exact;
+ struct list_head *pos, *tmp, *head;
force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) ||
/* Avoid accumulating infinitely long jmp history */
@@ -18539,15 +19012,14 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
env->insn_processed - env->prev_insn_processed >= 8)
add_new_state = true;
- pprev = explored_state(env, insn_idx);
- sl = *pprev;
-
clean_live_states(env, insn_idx, cur);
- while (sl) {
+ head = explored_state(env, insn_idx);
+ list_for_each_safe(pos, tmp, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
states_cnt++;
if (sl->state.insn_idx != insn_idx)
- goto next;
+ continue;
if (sl->state.branches) {
struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
@@ -18621,7 +19093,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) {
- update_loop_entry(cur, &sl->state);
+ update_loop_entry(env, cur, &sl->state);
goto hit;
}
}
@@ -18630,7 +19102,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
if (is_may_goto_insn_at(env, insn_idx)) {
if (sl->state.may_goto_depth != cur->may_goto_depth &&
states_equal(env, &sl->state, cur, RANGE_WITHIN)) {
- update_loop_entry(cur, &sl->state);
+ update_loop_entry(env, cur, &sl->state);
goto hit;
}
}
@@ -18697,11 +19169,13 @@ skip_inf_loop_check:
*
* Additional details are in the comment before get_loop_entry().
*/
- loop_entry = get_loop_entry(&sl->state);
+ loop_entry = get_loop_entry(env, &sl->state);
+ if (IS_ERR(loop_entry))
+ return PTR_ERR(loop_entry);
force_exact = loop_entry && loop_entry->branches > 0;
if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) {
if (force_exact)
- update_loop_entry(cur, loop_entry);
+ update_loop_entry(env, cur, loop_entry);
hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
@@ -18750,31 +19224,13 @@ miss:
/* the state is unlikely to be useful. Remove it to
* speed up verification
*/
- *pprev = sl->next;
- if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE &&
- !sl->state.used_as_loop_entry) {
- u32 br = sl->state.branches;
-
- WARN_ONCE(br,
- "BUG live_done but branches_to_explore %d\n",
- br);
- free_verifier_state(&sl->state, false);
- kfree(sl);
- env->peak_states--;
- } else {
- /* cannot free this state, since parentage chain may
- * walk it later. Add it for free_list instead to
- * be freed at the end of verification
- */
- sl->next = env->free_list;
- env->free_list = sl;
- }
- sl = *pprev;
- continue;
+ sl->in_free_list = true;
+ list_del(&sl->node);
+ list_add(&sl->node, &env->free_list);
+ env->free_list_size++;
+ env->explored_states_size--;
+ maybe_free_verifier_state(env, sl);
}
-next:
- pprev = &sl->next;
- sl = *pprev;
}
if (env->max_states_per_insn < states_cnt)
@@ -18799,7 +19255,8 @@ next:
if (!new_sl)
return -ENOMEM;
env->total_states++;
- env->peak_states++;
+ env->explored_states_size++;
+ update_peak_states(env);
env->prev_jmps_processed = env->jmps_processed;
env->prev_insn_processed = env->insn_processed;
@@ -18823,8 +19280,8 @@ next:
cur->first_insn_idx = insn_idx;
cur->insn_hist_start = cur->insn_hist_end;
cur->dfs_depth = new->dfs_depth + 1;
- new_sl->next = *explored_state(env, insn_idx);
- *explored_state(env, insn_idx) = new_sl;
+ list_add(&new_sl->node, head);
+
/* connect new state to parentage chain. Current frame needs all
* registers connected. Only r6 - r9 of the callers are alive (pushed
* to the stack implicitly by JITs) so in callers' frames connect just
@@ -19011,19 +19468,13 @@ static int do_check(struct bpf_verifier_env *env)
}
if (env->log.level & BPF_LOG_LEVEL) {
- const struct bpf_insn_cbs cbs = {
- .cb_call = disasm_kfunc_name,
- .cb_print = verbose,
- .private_data = env,
- };
-
if (verifier_state_scratched(env))
print_insn_state(env, state, state->curframe);
verbose_linfo(env, env->insn_idx, "; ");
env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
- print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ verbose_insn(env, insn);
env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
env->prev_log_pos = env->log.end_pos;
}
@@ -19045,37 +19496,18 @@ static int do_check(struct bpf_verifier_env *env)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type src_reg_type;
+ bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX;
- /* check for reserved fields is already done */
-
- /* check src operand */
- err = check_reg_arg(env, insn->src_reg, SRC_OP);
- if (err)
- return err;
-
- err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK);
- if (err)
- return err;
-
- src_reg_type = regs[insn->src_reg].type;
-
- /* check that memory (src_reg + off) is readable,
- * the state of dst_reg will be updated by this func
+ /* Check for reserved fields is already done in
+ * resolve_pseudo_ldimm64().
*/
- err = check_mem_access(env, env->insn_idx, insn->src_reg,
- insn->off, BPF_SIZE(insn->code),
- BPF_READ, insn->dst_reg, false,
- BPF_MODE(insn->code) == BPF_MEMSX);
- err = err ?: save_aux_ptr_type(env, src_reg_type, true);
- err = err ?: reg_bounds_sanity_check(env, &regs[insn->dst_reg], "ldx");
+ err = check_load_mem(env, insn, false, is_ldsx, true,
+ "ldx");
if (err)
return err;
} else if (class == BPF_STX) {
- enum bpf_reg_type dst_reg_type;
-
if (BPF_MODE(insn->code) == BPF_ATOMIC) {
- err = check_atomic(env, env->insn_idx, insn);
+ err = check_atomic(env, insn);
if (err)
return err;
env->insn_idx++;
@@ -19087,25 +19519,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- /* check src1 operand */
- err = check_reg_arg(env, insn->src_reg, SRC_OP);
- if (err)
- return err;
- /* check src2 operand */
- err = check_reg_arg(env, insn->dst_reg, SRC_OP);
- if (err)
- return err;
-
- dst_reg_type = regs[insn->dst_reg].type;
-
- /* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, env->insn_idx, insn->dst_reg,
- insn->off, BPF_SIZE(insn->code),
- BPF_WRITE, insn->src_reg, false, false);
- if (err)
- return err;
-
- err = save_aux_ptr_type(env, dst_reg_type, false);
+ err = check_store_reg(env, insn, false);
if (err)
return err;
} else if (class == BPF_ST) {
@@ -19245,6 +19659,10 @@ process_bpf_exit:
return err;
break;
} else {
+ if (WARN_ON_ONCE(env->cur_state->loop_entry)) {
+ verbose(env, "verifier bug: env->cur_state->loop_entry != NULL\n");
+ return -EFAULT;
+ }
do_print_state = true;
continue;
}
@@ -19504,7 +19922,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
+ if (btf_record_has_field(map->record, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
@@ -20334,7 +20752,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *subprogs = env->subprog_info;
const struct bpf_verifier_ops *ops = env->ops;
- int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0;
+ int i, cnt, size, ctx_field_size, ret, delta = 0, epilogue_cnt = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn *epilogue_buf = env->epilogue_buf;
struct bpf_insn *insn_buf = env->insn_buf;
@@ -20363,6 +20781,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
return -ENOMEM;
env->prog = new_prog;
delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, epilogue_buf, epilogue_cnt - 1);
+ if (ret < 0)
+ return ret;
}
}
@@ -20383,6 +20805,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
env->prog = new_prog;
delta += cnt - 1;
+
+ ret = add_kfunc_in_insns(env, insn_buf, cnt - 1);
+ if (ret < 0)
+ return ret;
}
}
@@ -20415,7 +20841,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
+ } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) ||
+ insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
@@ -20723,6 +21151,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb;
func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data;
+ func[i]->aux->might_sleep = env->subprog_info[i].might_sleep;
if (!i)
func[i]->aux->exception_boundary = env->seen_exception;
func[i] = bpf_int_jit_compile(func[i]);
@@ -20939,6 +21368,14 @@ static void specialize_kfunc(struct bpf_verifier_env *env,
*/
env->seen_direct_write = seen_direct_write;
}
+
+ if (func_id == special_kfunc_list[KF_bpf_set_dentry_xattr] &&
+ bpf_lsm_has_d_inode_locked(prog))
+ *addr = (unsigned long)bpf_set_dentry_xattr_locked;
+
+ if (func_id == special_kfunc_list[KF_bpf_remove_dentry_xattr] &&
+ bpf_lsm_has_d_inode_locked(prog))
+ *addr = (unsigned long)bpf_remove_dentry_xattr_locked;
}
static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
@@ -21373,7 +21810,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto next_insn;
}
- if (is_may_goto_insn(insn)) {
+ if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) {
+ int stack_off_cnt = -stack_depth - 16;
+
+ /*
+ * Two 8 byte slots, depth-16 stores the count, and
+ * depth-8 stores the start timestamp of the loop.
+ *
+ * The starting value of count is BPF_MAX_TIMED_LOOPS
+ * (0xffff). Every iteration loads it and subs it by 1,
+ * until the value becomes 0 in AX (thus, 1 in stack),
+ * after which we call arch_bpf_timed_may_goto, which
+ * either sets AX to 0xffff to keep looping, or to 0
+ * upon timeout. AX is then stored into the stack. In
+ * the next iteration, we either see 0 and break out, or
+ * continue iterating until the next time value is 0
+ * after subtraction, rinse and repeat.
+ */
+ stack_depth_extra = 16;
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt);
+ if (insn->off >= 0)
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5);
+ else
+ insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1);
+ insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2);
+ /*
+ * AX is used as an argument to pass in stack_off_cnt
+ * (to add to r10/fp), and also as the return value of
+ * the call to arch_bpf_timed_may_goto.
+ */
+ insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt);
+ insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto);
+ insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt);
+ cnt = 7;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto next_insn;
+ } else if (is_may_goto_insn(insn)) {
int stack_off = -stack_depth - 8;
stack_depth_extra = 8;
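
As a reading aid for the sequence emitted above, here is a rough userspace model of what one timed may_goto check does. It is not the generated BPF and not arch_bpf_timed_may_goto(); the 250 ms budget and the refill behaviour of the stand-in helper are assumptions made only for this illustration:

#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define MAX_TIMED_LOOPS 0xffffULL	/* mirrors BPF_MAX_TIMED_LOOPS */
#define LOOP_BUDGET_NS 250000000ULL	/* assumed budget; the real value lives in the arch helper */

struct loop_slots {	/* the two stack slots at depth-16 and depth-8 */
	uint64_t count;
	uint64_t start_ns;
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Stand-in for arch_bpf_timed_may_goto(): refill the counter or report timeout. */
static uint64_t timed_may_goto(struct loop_slots *s)
{
	if (!s->start_ns)
		s->start_ns = now_ns();
	return (now_ns() - s->start_ns < LOOP_BUDGET_NS) ? MAX_TIMED_LOOPS : 0;
}

/* One may_goto check, mirroring insn_buf[0..6] above. */
static bool may_continue(struct loop_slots *s)
{
	uint64_t ax = s->count;		/* insn_buf[0]: load count */

	if (ax == 0)			/* insn_buf[1]: a previous check reported timeout */
		return false;
	ax -= 1;			/* insn_buf[2] */
	if (ax == 0)			/* insn_buf[3]: counter chunk exhausted */
		ax = timed_may_goto(s);	/* insn_buf[4..5] */
	s->count = ax;			/* insn_buf[6]: store back */
	return true;
}

int main(void)
{
	struct loop_slots s = { .count = MAX_TIMED_LOOPS };
	unsigned long iters = 0;

	while (may_continue(&s))
		iters++;
	printf("terminated after %lu iterations\n", iters);
	return 0;
}

The two fields correspond to the depth-16 (count) and depth-8 (start timestamp) slots from the comment; the untimed branch below keeps using a single BPF_MAX_LOOPS counter slot.
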
@@ -21702,12 +22182,12 @@ patch_map_ops_generic:
if (insn->imm == BPF_FUNC_get_smp_processor_id &&
verifier_inlines_helper_call(env, insn->imm)) {
/* BPF_FUNC_get_smp_processor_id inlining is an
- * optimization, so if pcpu_hot.cpu_number is ever
+ * optimization, so if cpu_number is ever
* changed in some incompatible and hard to support
* way, it's fine to back out this inlining logic
*/
#ifdef CONFIG_SMP
- insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number);
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number);
insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
cnt = 3;
@@ -21897,6 +22377,13 @@ next_insn:
if (subprogs[cur_subprog + 1].start == i + delta + 1) {
subprogs[cur_subprog].stack_depth += stack_depth_extra;
subprogs[cur_subprog].stack_extra = stack_depth_extra;
+
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ if (stack_depth > MAX_BPF_STACK && !prog->jit_requested) {
+ verbose(env, "stack size %d(extra %d) is too large\n",
+ stack_depth, stack_depth_extra);
+ return -EINVAL;
+ }
cur_subprog++;
stack_depth = subprogs[cur_subprog].stack_depth;
stack_depth_extra = 0;
@@ -21907,23 +22394,33 @@ next_insn:
env->prog->aux->stack_depth = subprogs[0].stack_depth;
for (i = 0; i < env->subprog_cnt; i++) {
+ int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
int subprog_start = subprogs[i].start;
int stack_slots = subprogs[i].stack_extra / 8;
+ int slots = delta, cnt = 0;
if (!stack_slots)
continue;
- if (stack_slots > 1) {
+ /* We need two slots in case timed may_goto is supported. */
+ if (stack_slots > slots) {
verbose(env, "verifier bug: stack_slots supports may_goto only\n");
return -EFAULT;
}
- /* Add ST insn to subprog prologue to init extra stack */
- insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP,
- -subprogs[i].stack_depth, BPF_MAX_LOOPS);
+ stack_depth = subprogs[i].stack_depth;
+ if (bpf_jit_supports_timed_may_goto()) {
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_TIMED_LOOPS);
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0);
+ } else {
+ /* Add ST insn to subprog prologue to init extra stack */
+ insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth,
+ BPF_MAX_LOOPS);
+ }
/* Copy first actual insn to preserve it */
- insn_buf[1] = env->prog->insnsi[subprog_start];
+ insn_buf[cnt++] = env->prog->insnsi[subprog_start];
- new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2);
+ new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
env->prog = prog = new_prog;
@@ -21933,7 +22430,7 @@ next_insn:
* to insn after BPF_ST that inits may_goto count.
* Adjustment will succeed because bpf_patch_insn_data() didn't fail.
*/
- WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1));
+ WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta));
}
/* Since poke tab is now finalized, publish aux to tracker. */
@@ -22131,31 +22628,29 @@ static int remove_fastcall_spills_fills(struct bpf_verifier_env *env)
static void free_states(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state_list *sl, *sln;
+ struct bpf_verifier_state_list *sl;
+ struct list_head *head, *pos, *tmp;
int i;
- sl = env->free_list;
- while (sl) {
- sln = sl->next;
+ list_for_each_safe(pos, tmp, &env->free_list) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
free_verifier_state(&sl->state, false);
kfree(sl);
- sl = sln;
}
- env->free_list = NULL;
+ INIT_LIST_HEAD(&env->free_list);
if (!env->explored_states)
return;
for (i = 0; i < state_htab_size(env); i++) {
- sl = env->explored_states[i];
+ head = &env->explored_states[i];
- while (sl) {
- sln = sl->next;
+ list_for_each_safe(pos, tmp, head) {
+ sl = container_of(pos, struct bpf_verifier_state_list, node);
free_verifier_state(&sl->state, false);
kfree(sl);
- sl = sln;
}
- env->explored_states[i] = NULL;
+ INIT_LIST_HEAD(&env->explored_states[i]);
}
}
@@ -22163,6 +22658,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
struct bpf_subprog_info *sub = subprog_info(env, subprog);
+ struct bpf_prog_aux *aux = env->prog->aux;
struct bpf_verifier_state *state;
struct bpf_reg_state *regs;
int ret, i;
@@ -22270,6 +22766,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_known_zero(env, regs, BPF_REG_1);
}
+ /* Acquire references for struct_ops program arguments tagged with "__ref" */
+ if (!subprog && env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+ for (i = 0; i < aux->ctx_arg_info_size; i++)
+ aux->ctx_arg_info[i].ref_obj_id = aux->ctx_arg_info[i].refcounted ?
+ acquire_reference(env, 0) : 0;
+ }
+
ret = do_check(env);
out:
/* check for NULL is necessary, since cur_state can be freed inside
@@ -22392,6 +22895,15 @@ static void print_verification_stats(struct bpf_verifier_env *env)
env->peak_states, env->longest_mark_read_walk);
}
+int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog,
+ const struct bpf_ctx_arg_aux *info, u32 cnt)
+{
+ prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL);
+ prog->aux->ctx_arg_info_size = cnt;
+
+ return prog->aux->ctx_arg_info ? 0 : -ENOMEM;
+}
+
static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
{
const struct btf_type *t, *func_proto;
@@ -22399,10 +22911,11 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
const struct bpf_struct_ops *st_ops;
const struct btf_member *member;
struct bpf_prog *prog = env->prog;
- u32 btf_id, member_idx;
+ bool has_refcounted_arg = false;
+ u32 btf_id, member_idx, member_off;
struct btf *btf;
const char *mname;
- int err;
+ int i, err;
if (!prog->gpl_compatible) {
verbose(env, "struct ops programs must have a GPL compatible license\n");
@@ -22450,7 +22963,8 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
return -EINVAL;
}
- err = bpf_struct_ops_supported(st_ops, __btf_member_bit_offset(t, member) / 8);
+ member_off = __btf_member_bit_offset(t, member) / 8;
+ err = bpf_struct_ops_supported(st_ops, member_off);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
mname, st_ops->name);
@@ -22472,17 +22986,32 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
return -EACCES;
}
- /* btf_ctx_access() used this to provide argument type info */
- prog->aux->ctx_arg_info =
- st_ops_desc->arg_info[member_idx].info;
- prog->aux->ctx_arg_info_size =
- st_ops_desc->arg_info[member_idx].cnt;
+ for (i = 0; i < st_ops_desc->arg_info[member_idx].cnt; i++) {
+ if (st_ops_desc->arg_info[member_idx].info->refcounted) {
+ has_refcounted_arg = true;
+ break;
+ }
+ }
+
+ /* Tail call is not allowed for programs with refcounted arguments since we
+ * cannot guarantee that valid refcounted kptrs will be passed to the callee.
+ */
+ for (i = 0; i < env->subprog_cnt; i++) {
+ if (has_refcounted_arg && env->subprog_info[i].has_tail_call) {
+ verbose(env, "program with __ref argument cannot tail call\n");
+ return -EINVAL;
+ }
+ }
+
+ prog->aux->st_ops = st_ops;
+ prog->aux->attach_st_ops_member_off = member_off;
prog->aux->attach_func_proto = func_proto;
prog->aux->attach_func_name = mname;
env->ops = st_ops->verifier_ops;
- return 0;
+ return bpf_prog_ctx_arg_info_init(prog, st_ops_desc->arg_info[member_idx].info,
+ st_ops_desc->arg_info[member_idx].cnt);
}
#define SECURITY_PREFIX "security_"
@@ -22558,6 +23087,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
if (tgt_prog) {
struct bpf_prog_aux *aux = tgt_prog->aux;
bool tgt_changes_pkt_data;
+ bool tgt_might_sleep;
if (bpf_prog_is_dev_bound(prog->aux) &&
!bpf_prog_dev_bound_match(prog, tgt_prog)) {
@@ -22600,6 +23130,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
"Extension program changes packet data, while original does not\n");
return -EINVAL;
}
+
+ tgt_might_sleep = aux->func
+ ? aux->func[subprog]->aux->might_sleep
+ : aux->might_sleep;
+ if (prog->aux->might_sleep && !tgt_might_sleep) {
+ bpf_log(log,
+ "Extension program may sleep, while original does not\n");
+ return -EINVAL;
+ }
}
if (!tgt_prog->jited) {
bpf_log(log, "Can attach to only JITed progs\n");
@@ -22856,6 +23395,33 @@ BTF_ID(func, __rcu_read_unlock)
#endif
BTF_SET_END(btf_id_deny)
+/* fexit and fmod_ret can't be used to attach to __noreturn functions.
+ * Currently, we must manually list all __noreturn functions here. Once a more
+ * robust solution is implemented, this workaround can be removed.
+ */
+BTF_SET_START(noreturn_deny)
+#ifdef CONFIG_IA32_EMULATION
+BTF_ID(func, __ia32_sys_exit)
+BTF_ID(func, __ia32_sys_exit_group)
+#endif
+#ifdef CONFIG_KUNIT
+BTF_ID(func, __kunit_abort)
+BTF_ID(func, kunit_try_catch_throw)
+#endif
+#ifdef CONFIG_MODULES
+BTF_ID(func, __module_put_and_kthread_exit)
+#endif
+#ifdef CONFIG_X86_64
+BTF_ID(func, __x64_sys_exit)
+BTF_ID(func, __x64_sys_exit_group)
+#endif
+BTF_ID(func, do_exit)
+BTF_ID(func, do_group_exit)
+BTF_ID(func, kthread_complete_and_exit)
+BTF_ID(func, kthread_exit)
+BTF_ID(func, make_task_dead)
+BTF_SET_END(noreturn_deny)
+
static bool can_be_sleepable(struct bpf_prog *prog)
{
if (prog->type == BPF_PROG_TYPE_TRACING) {
@@ -22932,9 +23498,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
prog->aux->attach_btf_trace = true;
return 0;
} else if (prog->expected_attach_type == BPF_TRACE_ITER) {
- if (!bpf_iter_prog_supported(prog))
- return -EINVAL;
- return 0;
+ return bpf_iter_prog_supported(prog);
}
if (prog->type == BPF_PROG_TYPE_LSM) {
@@ -22944,6 +23508,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
} else if (prog->type == BPF_PROG_TYPE_TRACING &&
btf_id_set_contains(&btf_id_deny, btf_id)) {
return -EINVAL;
+ } else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
+ prog->expected_attach_type == BPF_MODIFY_RETURN) &&
+ btf_id_set_contains(&noreturn_deny, btf_id)) {
+ verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n");
+ return -EINVAL;
}
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
@@ -23036,6 +23605,302 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr,
return 0;
}
+static bool can_fallthrough(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+
+ if (class != BPF_JMP && class != BPF_JMP32)
+ return true;
+
+ if (opcode == BPF_EXIT || opcode == BPF_JA)
+ return false;
+
+ return true;
+}
+
+static bool can_jump(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+
+ if (class != BPF_JMP && class != BPF_JMP32)
+ return false;
+
+ switch (opcode) {
+ case BPF_JA:
+ case BPF_JEQ:
+ case BPF_JNE:
+ case BPF_JLT:
+ case BPF_JLE:
+ case BPF_JGT:
+ case BPF_JGE:
+ case BPF_JSGT:
+ case BPF_JSGE:
+ case BPF_JSLT:
+ case BPF_JSLE:
+ case BPF_JCOND:
+ return true;
+ }
+
+ return false;
+}
+
+static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2])
+{
+ struct bpf_insn *insn = &prog->insnsi[idx];
+ int i = 0, insn_sz;
+ u32 dst;
+
+ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
+ if (can_fallthrough(insn) && idx + 1 < prog->len)
+ succ[i++] = idx + insn_sz;
+
+ if (can_jump(insn)) {
+ dst = idx + jmp_offset(insn) + 1;
+ if (i == 0 || succ[0] != dst)
+ succ[i++] = dst;
+ }
+
+ return i;
+}
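
A few worked cases for insn_successors(), assuming jmp_offset() simply returns the branch displacement encoded in the instruction (insn->off, or insn->imm for the 32-bit gotol form): a conditional "if r1 > r2 goto +5" at index 10 yields two successors, 11 (fall-through) and 16 (target); an unconditional "goto +3" at index 10 yields only 14, because can_fallthrough() rejects BPF_JA; a fall-through over a two-slot ldimm64 at index 10 lands on 12; and a conditional branch with offset 0 reports a single successor, since the succ[0] != dst test deduplicates target and fall-through.
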
+
+/* Each field is a register bitmask */
+struct insn_live_regs {
+ u16 use; /* registers read by instruction */
+ u16 def; /* registers written by instruction */
+ u16 in; /* registers that may be alive before instruction */
+ u16 out; /* registers that may be alive after instruction */
+};
+
+/* Bitmask with 1s for all caller saved registers */
+#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1)
+
+/* Compute info->{use,def} fields for the instruction */
+static void compute_insn_live_regs(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct insn_live_regs *info)
+{
+ struct call_summary cs;
+ u8 class = BPF_CLASS(insn->code);
+ u8 code = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u16 src = BIT(insn->src_reg);
+ u16 dst = BIT(insn->dst_reg);
+ u16 r0 = BIT(0);
+ u16 def = 0;
+ u16 use = 0xffff;
+
+ switch (class) {
+ case BPF_LD:
+ switch (mode) {
+ case BPF_IMM:
+ if (BPF_SIZE(insn->code) == BPF_DW) {
+ def = dst;
+ use = 0;
+ }
+ break;
+ case BPF_LD | BPF_ABS:
+ case BPF_LD | BPF_IND:
+ /* stick with defaults */
+ break;
+ }
+ break;
+ case BPF_LDX:
+ switch (mode) {
+ case BPF_MEM:
+ case BPF_MEMSX:
+ def = dst;
+ use = src;
+ break;
+ }
+ break;
+ case BPF_ST:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst;
+ break;
+ }
+ break;
+ case BPF_STX:
+ switch (mode) {
+ case BPF_MEM:
+ def = 0;
+ use = dst | src;
+ break;
+ case BPF_ATOMIC:
+ switch (insn->imm) {
+ case BPF_CMPXCHG:
+ use = r0 | dst | src;
+ def = r0;
+ break;
+ case BPF_LOAD_ACQ:
+ def = dst;
+ use = src;
+ break;
+ case BPF_STORE_REL:
+ def = 0;
+ use = dst | src;
+ break;
+ default:
+ use = dst | src;
+ if (insn->imm & BPF_FETCH)
+ def = src;
+ else
+ def = 0;
+ }
+ break;
+ }
+ break;
+ case BPF_ALU:
+ case BPF_ALU64:
+ switch (code) {
+ case BPF_END:
+ use = dst;
+ def = dst;
+ break;
+ case BPF_MOV:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = 0;
+ else
+ use = src;
+ break;
+ default:
+ def = dst;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ case BPF_JMP:
+ case BPF_JMP32:
+ switch (code) {
+ case BPF_JA:
+ case BPF_JCOND:
+ def = 0;
+ use = 0;
+ break;
+ case BPF_EXIT:
+ def = 0;
+ use = r0;
+ break;
+ case BPF_CALL:
+ def = ALL_CALLER_SAVED_REGS;
+ use = def & ~BIT(BPF_REG_0);
+ if (get_call_summary(env, insn, &cs))
+ use = GENMASK(cs.num_params, 1);
+ break;
+ default:
+ def = 0;
+ if (BPF_SRC(insn->code) == BPF_K)
+ use = dst;
+ else
+ use = dst | src;
+ }
+ break;
+ }
+
+ info->def = def;
+ info->use = use;
+}
+
+/* Compute may-live registers after each instruction in the program.
+ * The register is live after the instruction I if it is read by some
+ * instruction S following I during program execution and is not
+ * overwritten between I and S.
+ *
+ * Store result in env->insn_aux_data[i].live_regs.
+ */
+static int compute_live_registers(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *insn_aux = env->insn_aux_data;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct insn_live_regs *state;
+ int insn_cnt = env->prog->len;
+ int err = 0, i, j;
+ bool changed;
+
+ /* Use the following algorithm:
+ * - define the following:
+ * - I.use : a set of all registers read by instruction I;
+ * - I.def : a set of all registers written by instruction I;
+ * - I.in : a set of all registers that may be alive before I execution;
+ * - I.out : a set of all registers that may be alive after I execution;
+ * - insn_successors(I): a set of instructions S that might immediately
+ * follow I for some program execution;
+ * - associate separate empty sets 'I.in' and 'I.out' with each instruction;
+ * - visit each instruction in a postorder and update
+ * state[i].in, state[i].out as follows:
+ *
+ * state[i].out = U [state[s].in for S in insn_successors(i)]
+ * state[i].in = (state[i].out / state[i].def) U state[i].use
+ *
+ * (where U stands for set union, / stands for set difference)
+ * - repeat the computation while the {in,out} fields change for
+ * any instruction.
+ */
+ state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL);
+ if (!state) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ compute_insn_live_regs(env, &insns[i], &state[i]);
+
+ changed = true;
+ while (changed) {
+ changed = false;
+ for (i = 0; i < env->cfg.cur_postorder; ++i) {
+ int insn_idx = env->cfg.insn_postorder[i];
+ struct insn_live_regs *live = &state[insn_idx];
+ int succ_num;
+ u32 succ[2];
+ u16 new_out = 0;
+ u16 new_in = 0;
+
+ succ_num = insn_successors(env->prog, insn_idx, succ);
+ for (int s = 0; s < succ_num; ++s)
+ new_out |= state[succ[s]].in;
+ new_in = (new_out & ~live->def) | live->use;
+ if (new_out != live->out || new_in != live->in) {
+ live->in = new_in;
+ live->out = new_out;
+ changed = true;
+ }
+ }
+ }
+
+ for (i = 0; i < insn_cnt; ++i)
+ insn_aux[i].live_regs_before = state[i].in;
+
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "Live regs before insn:\n");
+ for (i = 0; i < insn_cnt; ++i) {
+ verbose(env, "%3d: ", i);
+ for (j = BPF_REG_0; j < BPF_REG_10; ++j)
+ if (insn_aux[i].live_regs_before & BIT(j))
+ verbose(env, "%d", j);
+ else
+ verbose(env, ".");
+ verbose(env, " ");
+ verbose_insn(env, &insns[i]);
+ if (bpf_is_ldimm64(&insns[i]))
+ i++;
+ }
+ }
+
+out:
+ kvfree(state);
+ kvfree(env->cfg.insn_postorder);
+ env->cfg.insn_postorder = NULL;
+ env->cfg.cur_postorder = 0;
+ return err;
+}
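
To make the in/out equations from the comment above concrete, here is a small self-contained userspace sketch of the same fixed point on a made-up four-instruction program (the toy program, masks and visit order are invented for the example and are not verifier code; the verifier itself walks the postorder collected in check_cfg()):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define NINSN 4

/* toy program, bit n of a mask stands for register rn:
 *  0: r1 = 1              def={r1}   use={}
 *  1: if r1 > r2 goto 3   def={}     use={r1,r2}
 *  2: r0 = r1             def={r0}   use={r1}
 *  3: exit                def={}     use={r0}
 */
static const uint16_t use[NINSN] = { 0, (1 << 1) | (1 << 2), 1 << 1, 1 << 0 };
static const uint16_t def[NINSN] = { 1 << 1, 0, 1 << 0, 0 };
/* successors, -1 means "no successor": insn 1 falls through to 2 and jumps to 3 */
static const int succ[NINSN][2] = { {1, -1}, {2, 3}, {3, -1}, {-1, -1} };

int main(void)
{
	uint16_t in[NINSN] = {0}, out[NINSN] = {0};
	bool changed = true;

	while (changed) {
		changed = false;
		/* reverse program order is good enough here; the verifier uses the CFG postorder */
		for (int i = NINSN - 1; i >= 0; i--) {
			uint16_t new_out = 0, new_in;

			for (int s = 0; s < 2; s++)
				if (succ[i][s] >= 0)
					new_out |= in[succ[i][s]];
			new_in = (uint16_t)((new_out & ~def[i]) | use[i]);
			if (new_in != in[i] || new_out != out[i]) {
				in[i] = new_in;
				out[i] = new_out;
				changed = true;
			}
		}
	}
	for (int i = 0; i < NINSN; i++)
		printf("insn %d: live-before mask 0x%04x\n", i, in[i]);
	return 0;
}

It converges in two passes and prints a live-before mask of 0x0005 for instruction 0: r2 is live because the branch reads it, and r0 is live because the jump path reaches exit without ever writing it.
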
+
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
@@ -23113,12 +23978,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
env->test_reg_invariants = attr->prog_flags & BPF_F_TEST_REG_INVARIANTS;
env->explored_states = kvcalloc(state_htab_size(env),
- sizeof(struct bpf_verifier_state_list *),
+ sizeof(struct list_head),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;
+ for (i = 0; i < state_htab_size(env); i++)
+ INIT_LIST_HEAD(&env->explored_states[i]);
+ INIT_LIST_HEAD(&env->free_list);
+
ret = check_btf_info_early(env, attr, uattr);
if (ret < 0)
goto skip_full_check;
@@ -23153,6 +24022,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret)
goto skip_full_check;
+ ret = compute_live_registers(env);
+ if (ret < 0)
+ goto skip_full_check;
+
ret = mark_fastcall_patterns(env);
if (ret < 0)
goto skip_full_check;
@@ -23291,6 +24164,7 @@ err_unlock:
vfree(env->insn_aux_data);
kvfree(env->insn_hist);
err_free_env:
+ kvfree(env->cfg.insn_postorder);
kvfree(env);
return ret;
}
diff --git a/kernel/capability.c b/kernel/capability.c
index e089d2628c29..829f49ae07b9 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -286,22 +286,6 @@ bool has_ns_capability(struct task_struct *t,
}
/**
- * has_capability - Does a task have a capability in init_user_ns
- * @t: The task in question
- * @cap: The capability to be tested for
- *
- * Return true if the specified task has the given superior capability
- * currently in effect to the initial user namespace, false if not.
- *
- * Note that this does not set PF_SUPERPRIV on the task.
- */
-bool has_capability(struct task_struct *t, int cap)
-{
- return has_ns_capability(t, &init_user_ns, cap);
-}
-EXPORT_SYMBOL(has_capability);
-
-/**
* has_ns_capability_noaudit - Does a task have a capability (unaudited)
* in a specific user ns.
* @t: The task in question
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 08caad776717..422fa4f958ae 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -7,6 +7,8 @@
#include <linux/cfi.h>
+bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE);
+
enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
unsigned long *target, u32 type)
{
@@ -17,7 +19,7 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
pr_err("CFI failure at %pS (no target information)\n",
(void *)addr);
- if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) {
+ if (cfi_warn) {
__warn(NULL, 0, (void *)addr, 0, regs, NULL);
return BUG_TRAP_TYPE_WARN;
}
@@ -71,14 +73,11 @@ static bool is_module_cfi_trap(unsigned long addr)
struct module *mod;
bool found = false;
- rcu_read_lock_sched_notrace();
-
+ guard(rcu)();
mod = __module_address(addr);
if (mod)
found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end);
- rcu_read_unlock_sched_notrace();
-
return found;
}
#else /* CONFIG_MODULES */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c964dd7ff967..95ab39e1ec8f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -168,6 +168,7 @@ struct cgroup_mgctx {
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
+extern bool cgrp_dfl_visible;
/* iterate across the hierarchies */
#define for_each_root(root) \
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index e28d5f0d20ed..11ea8d24ac72 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -673,6 +673,7 @@ struct cftype cgroup1_base_files[] = {
int proc_cgroupstats_show(struct seq_file *m, void *v)
{
struct cgroup_subsys *ss;
+ bool cgrp_v1_visible = false;
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -684,12 +685,18 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i) {
if (cgroup1_subsys_absent(ss))
continue;
+ cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
}
+ if (cgrp_dfl_visible && !cgrp_v1_visible)
+ pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n");
+
+
return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index afc665b7b1fe..f231fe3a0744 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
-static bool cgrp_dfl_visible;
+bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
@@ -4447,7 +4447,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
@@ -5831,7 +5831,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
}
/*
- * This extra ref will be put in cgroup_free_fn() and guarantees
+ * This extra ref will be put in css_free_rwork_fn() and guarantees
* that @cgrp->kn is always accessible.
*/
kernfs_get(cgrp->kn);
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 25c1d7b77e2f..b69a7db67090 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include "cgroup-internal.h"
#include "cpuset-internal.h"
/*
@@ -175,6 +176,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = update_relax_domain_level(cs, val);
break;
default:
@@ -373,6 +375,46 @@ out:
return ret;
}
+#ifdef CONFIG_PROC_PID_CPUSET
+/*
+ * proc_cpuset_show()
+ * - Print tasks cpuset path into seq_file.
+ * - Used for /proc/<pid>/cpuset.
+ */
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ char *buf;
+ struct cgroup_subsys_state *css;
+ int retval;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+ css = task_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+
+ if (retval == -E2BIG)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
+ goto out_free;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+ retval = 0;
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
+#endif /* CONFIG_PROC_PID_CPUSET */
+
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct cpuset *cs = css_cs(css);
@@ -424,24 +466,31 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
break;
case FILE_MEM_EXCLUSIVE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
break;
case FILE_MEM_HARDWALL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
break;
case FILE_SCHED_LOAD_BALANCE:
+ pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
break;
case FILE_MEMORY_MIGRATE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
break;
case FILE_MEMORY_PRESSURE_ENABLED:
+ pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
cpuset_memory_pressure_enabled = !!val;
break;
case FILE_SPREAD_PAGE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
break;
case FILE_SPREAD_SLAB:
+ pr_warn_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
break;
default:
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f910c828973..39c1fc643d77 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,7 +21,6 @@
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
-#include "cgroup-internal.h"
#include "cpuset-internal.h"
#include <linux/init.h>
@@ -954,10 +953,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs)
css_task_iter_end(&it);
}
-static void dl_rebuild_rd_accounting(void)
+void dl_rebuild_rd_accounting(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
+ int cpu;
+ u64 cookie = ++dl_cookie;
lockdep_assert_held(&cpuset_mutex);
lockdep_assert_cpus_held();
@@ -965,11 +966,12 @@ static void dl_rebuild_rd_accounting(void)
rcu_read_lock();
- /*
- * Clear default root domain DL accounting, it will be computed again
- * if a task belongs to it.
- */
- dl_clear_root_domain(&def_root_domain);
+ for_each_possible_cpu(cpu) {
+ if (dl_bw_visited(cpu, cookie))
+ continue;
+
+ dl_clear_root_domain_cpu(cpu);
+ }
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
@@ -990,16 +992,6 @@ static void dl_rebuild_rd_accounting(void)
rcu_read_unlock();
}
-static void
-partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
-{
- mutex_lock(&sched_domains_mutex);
- partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- dl_rebuild_rd_accounting();
- mutex_unlock(&sched_domains_mutex);
-}
-
/*
* Rebuild scheduler domains.
*
@@ -1061,7 +1053,7 @@ void rebuild_sched_domains_locked(void)
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
- partition_and_rebuild_sched_domains(ndoms, doms, attr);
+ partition_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
void rebuild_sched_domains_locked(void)
@@ -1083,6 +1075,13 @@ void rebuild_sched_domains(void)
cpus_read_unlock();
}
+void cpuset_reset_sched_domains(void)
+{
+ mutex_lock(&cpuset_mutex);
+ partition_sched_domains(1, NULL, NULL);
+ mutex_unlock(&cpuset_mutex);
+}
+
/**
* cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -4244,50 +4243,6 @@ void cpuset_print_current_mems_allowed(void)
rcu_read_unlock();
}
-#ifdef CONFIG_PROC_PID_CPUSET
-/*
- * proc_cpuset_show()
- * - Print tasks cpuset path into seq_file.
- * - Used for /proc/<pid>/cpuset.
- * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
- * doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
- * anyway.
- */
-int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *tsk)
-{
- char *buf;
- struct cgroup_subsys_state *css;
- int retval;
-
- retval = -ENOMEM;
- buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!buf)
- goto out;
-
- rcu_read_lock();
- spin_lock_irq(&css_set_lock);
- css = task_css(tsk, cpuset_cgrp_id);
- retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
- spin_unlock_irq(&css_set_lock);
- rcu_read_unlock();
-
- if (retval == -E2BIG)
- retval = -ENAMETOOLONG;
- if (retval < 0)
- goto out_free;
- seq_puts(m, buf);
- seq_putc(m, '\n');
- retval = 0;
-out_free:
- kfree(buf);
-out:
- return retval;
-}
-#endif /* CONFIG_PROC_PID_CPUSET */
-
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 074653f964c1..039d1eb2f215 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -430,9 +430,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of,
if (strcmp(buf, freezer_state_strs(0)) == 0)
freeze = false;
- else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) {
+ pr_info_once("Freezing with imperfect legacy cgroup freezer. "
+ "See cgroup.freeze of cgroup v2\n");
freeze = true;
- else
+ } else
return -EINVAL;
freezer_change_state(css_freezer(of_css(of)), freeze);
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 0e26068995a6..2fa3a4fb2aaf 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -68,22 +68,6 @@ static inline bool valid_type(enum misc_res_type type)
}
/**
- * misc_cg_res_total_usage() - Get the current total usage of the resource.
- * @type: misc res type.
- *
- * Context: Any context.
- * Return: Current total usage of the resource.
- */
-u64 misc_cg_res_total_usage(enum misc_res_type type)
-{
- if (valid_type(type))
- return atomic64_read(&root_cg.res[type].usage);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
-
-/**
* misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
* @type: Type of the misc res.
* @capacity: Supported capacity of the misc res on the host.
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index aac91466279f..4bb587d5d34f 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -299,40 +299,6 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
spin_unlock_irq(&cgroup_rstat_lock);
}
-/* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
- __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
-{
- int cpu;
-
- lockdep_assert_held(&cgroup_rstat_lock);
-
- for_each_possible_cpu(cpu) {
- struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
-
- for (; pos; pos = pos->rstat_flush_next) {
- struct cgroup_subsys_state *css;
-
- cgroup_base_stat_flush(pos, cpu);
- bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
-
- rcu_read_lock();
- list_for_each_entry_rcu(css, &pos->rstat_css_list,
- rstat_css_node)
- css->ss->css_rstat_flush(css, cpu);
- rcu_read_unlock();
- }
-
- /* play nice and yield if necessary */
- if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
- __cgroup_rstat_unlock(cgrp, cpu);
- if (!cond_resched())
- cpu_relax();
- __cgroup_rstat_lock(cgrp, cpu);
- }
- }
-}
-
/**
* cgroup_rstat_flush - flush stats in @cgrp's subtree
* @cgrp: target cgroup
@@ -348,38 +314,30 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
*/
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
+ int cpu;
+
might_sleep();
+ for_each_possible_cpu(cpu) {
+ struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
- __cgroup_rstat_lock(cgrp, -1);
- cgroup_rstat_flush_locked(cgrp);
- __cgroup_rstat_unlock(cgrp, -1);
-}
+ /* Reacquire for each CPU to avoid disabling IRQs too long */
+ __cgroup_rstat_lock(cgrp, cpu);
+ for (; pos; pos = pos->rstat_flush_next) {
+ struct cgroup_subsys_state *css;
-/**
- * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
- * @cgrp: target cgroup
- *
- * Flush stats in @cgrp's subtree and prevent further flushes. Must be
- * paired with cgroup_rstat_flush_release().
- *
- * This function may block.
- */
-void cgroup_rstat_flush_hold(struct cgroup *cgrp)
- __acquires(&cgroup_rstat_lock)
-{
- might_sleep();
- __cgroup_rstat_lock(cgrp, -1);
- cgroup_rstat_flush_locked(cgrp);
-}
+ cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
-/**
- * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
- * @cgrp: cgroup used by tracepoint
- */
-void cgroup_rstat_flush_release(struct cgroup *cgrp)
- __releases(&cgroup_rstat_lock)
-{
- __cgroup_rstat_unlock(cgrp, -1);
+ rcu_read_lock();
+ list_for_each_entry_rcu(css, &pos->rstat_css_list,
+ rstat_css_node)
+ css->ss->css_rstat_flush(css, cpu);
+ rcu_read_unlock();
+ }
+ __cgroup_rstat_unlock(cgrp, cpu);
+ if (!cond_resched())
+ cpu_relax();
+ }
}
int cgroup_rstat_init(struct cgroup *cgrp)
@@ -612,36 +570,34 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- u64 usage, utime, stime, ntime;
+ struct cgroup_base_stat bstat;
if (cgroup_parent(cgrp)) {
- cgroup_rstat_flush_hold(cgrp);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
+ cgroup_rstat_flush(cgrp);
+ __cgroup_rstat_lock(cgrp, -1);
+ bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
- &utime, &stime);
- ntime = cgrp->bstat.ntime;
- cgroup_rstat_flush_release(cgrp);
+ &bstat.cputime.utime, &bstat.cputime.stime);
+ __cgroup_rstat_unlock(cgrp, -1);
} else {
- /* cgrp->bstat of root is not actually used, reuse it */
- root_cgroup_cputime(&cgrp->bstat);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
- utime = cgrp->bstat.cputime.utime;
- stime = cgrp->bstat.cputime.stime;
- ntime = cgrp->bstat.ntime;
+ root_cgroup_cputime(&bstat);
}
- do_div(usage, NSEC_PER_USEC);
- do_div(utime, NSEC_PER_USEC);
- do_div(stime, NSEC_PER_USEC);
- do_div(ntime, NSEC_PER_USEC);
+ do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
+ do_div(bstat.cputime.utime, NSEC_PER_USEC);
+ do_div(bstat.cputime.stime, NSEC_PER_USEC);
+ do_div(bstat.ntime, NSEC_PER_USEC);
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n"
"nice_usec %llu\n",
- usage, utime, stime, ntime);
+ bstat.cputime.sum_exec_runtime,
+ bstat.cputime.utime,
+ bstat.cputime.stime,
+ bstat.ntime);
- cgroup_force_idle_show(seq, &cgrp->bstat);
+ cgroup_force_idle_show(seq, &bstat);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index 3fabb8f55ef6..dd7c32fb5ac1 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -46,7 +46,7 @@ CONFIG_UBSAN_BOUNDS=y
# CONFIG_UBSAN_SHIFT is not set
# CONFIG_UBSAN_DIV_ZERO is not set
# CONFIG_UBSAN_UNREACHABLE is not set
-# CONFIG_UBSAN_SIGNED_WRAP is not set
+# CONFIG_UBSAN_INTEGER_WRAP is not set
# CONFIG_UBSAN_BOOL is not set
# CONFIG_UBSAN_ENUM is not set
# CONFIG_UBSAN_ALIGNMENT is not set
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 938c48952d26..fb5be6e9b423 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -80,17 +80,16 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void)
*/
static noinstr void ct_kernel_exit_state(int offset)
{
- int seq;
-
/*
* CPUs seeing atomic_add_return() must see prior RCU read-side
* critical sections, and we also must force ordering with the
* next idle sojourn.
*/
rcu_task_trace_heavyweight_enter(); // Before CT state update!
- seq = ct_state_inc(offset);
- // RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING));
+ // RCU is still watching. Better not be in extended quiescent state!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu());
+ (void)ct_state_inc(offset);
+ // RCU is no longer watching.
}
/*
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 07455d25329c..b08bb34b1718 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -526,6 +526,7 @@ void lockdep_assert_cpus_held(void)
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
+EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held);
#ifdef CONFIG_LOCKDEP
int lockdep_is_cpus_held(void)
@@ -1453,11 +1454,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
out:
cpus_write_unlock();
- /*
- * Do post unplug cleanup. This is still protected against
- * concurrent CPU hotplug via cpu_add_remove_lock.
- */
- lockup_detector_cleanup();
arch_smt_update();
return ret;
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 078fe5bc5a74..335b8425dd4b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -436,7 +436,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
memset(&prstatus, 0, sizeof(prstatus));
prstatus.common.pr_pid = current->pid;
elf_core_copy_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
final_note(buf);
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 5b4e6d3bf7bc..b8fe0b3d0ffb 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -584,6 +584,22 @@ int dma_direct_supported(struct device *dev, u64 mask)
return mask >= phys_to_dma_unencrypted(dev, min_mask);
}
+static const struct bus_dma_region *dma_find_range(struct device *dev,
+ unsigned long start_pfn)
+{
+ const struct bus_dma_region *m;
+
+ for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) {
+ unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start);
+
+ if (start_pfn >= cpu_start_pfn &&
+ start_pfn - cpu_start_pfn < PFN_DOWN(m->size))
+ return m;
+ }
+
+ return NULL;
+}
+
/*
* To check whether all ram resource ranges are covered by dma range map
* Returns 0 when further check is needed
@@ -593,20 +609,12 @@ static int check_ram_in_range_map(unsigned long start_pfn,
unsigned long nr_pages, void *data)
{
unsigned long end_pfn = start_pfn + nr_pages;
- const struct bus_dma_region *bdr = NULL;
- const struct bus_dma_region *m;
struct device *dev = data;
while (start_pfn < end_pfn) {
- for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) {
- unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start);
+ const struct bus_dma_region *bdr;
- if (start_pfn >= cpu_start_pfn &&
- start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) {
- bdr = m;
- break;
- }
- }
+ bdr = dma_find_range(dev, start_pfn);
if (!bdr)
return 1;
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
index 095c775e001e..d4b8bd0af79b 100644
--- a/kernel/entry/Makefile
+++ b/kernel/entry/Makefile
@@ -6,6 +6,9 @@ KASAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
+# Branch profiling isn't noinstr-safe
+ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
+
CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
CFLAGS_common.o += -fno-stack-protector
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index e33691d5adf7..20154572ede9 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -49,7 +49,7 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Do seccomp after ptrace, to catch any tracer changes. */
if (work & SYSCALL_WORK_SECCOMP) {
- ret = __secure_computing(NULL);
+ ret = __secure_computing();
if (ret == -1L)
return ret;
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 8a47e52a454f..6c83ad674d01 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -22,6 +22,7 @@ struct callchain_cpus_entries {
int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
+static const int six_hundred_forty_kb = 640 * 1024;
static inline size_t perf_callchain_entry__sizeof(void)
{
@@ -266,12 +267,8 @@ exit_put:
return entry;
}
-/*
- * Used for sysctl_perf_event_max_stack and
- * sysctl_perf_event_max_contexts_per_stack.
- */
-int perf_event_max_stack_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int perf_event_max_stack_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *value = table->data;
int new_value = *value, ret;
@@ -292,3 +289,32 @@ int perf_event_max_stack_handler(const struct ctl_table *table, int write,
return ret;
}
+
+static const struct ctl_table callchain_sysctl_table[] = {
+ {
+ .procname = "perf_event_max_stack",
+ .data = &sysctl_perf_event_max_stack,
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&six_hundred_forty_kb,
+ },
+ {
+ .procname = "perf_event_max_contexts_per_stack",
+ .data = &sysctl_perf_event_max_contexts_per_stack,
+ .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_THOUSAND,
+ },
+};
+
+static int __init init_callchain_sysctls(void)
+{
+ register_sysctl_init("kernel", callchain_sysctl_table);
+ return 0;
+}
+core_initcall(init_callchain_sysctls);
+
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 823aa0824916..0bb21659e252 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -55,6 +55,7 @@
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>
+#include <linux/percpu-rwsem.h>
#include "internal.h"
@@ -452,8 +453,8 @@ static struct kmem_cache *perf_event_cache;
*/
int sysctl_perf_event_paranoid __read_mostly = 2;
-/* Minimum for 512 kiB + 1 user control page */
-int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
+/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
+static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
/*
* max perf event sample rate
@@ -463,6 +464,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
#define DEFAULT_CPU_TIME_MAX_PERCENT 25
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
@@ -484,7 +486,7 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
+static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -506,9 +508,7 @@ int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
return 0;
}
-int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
-
-int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
+static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -528,6 +528,52 @@ int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
return 0;
}
+static const struct ctl_table events_core_sysctl_table[] = {
+ /*
+ * User-space relies on this file as a feature check for
+ * perf_events being enabled. It's an ABI, do not remove!
+ */
+ {
+ .procname = "perf_event_paranoid",
+ .data = &sysctl_perf_event_paranoid,
+ .maxlen = sizeof(sysctl_perf_event_paranoid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_mlock_kb",
+ .data = &sysctl_perf_event_mlock,
+ .maxlen = sizeof(sysctl_perf_event_mlock),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_max_sample_rate",
+ .data = &sysctl_perf_event_sample_rate,
+ .maxlen = sizeof(sysctl_perf_event_sample_rate),
+ .mode = 0644,
+ .proc_handler = perf_event_max_sample_rate_handler,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "perf_cpu_time_max_percent",
+ .data = &sysctl_perf_cpu_time_max_percent,
+ .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
+ .mode = 0644,
+ .proc_handler = perf_cpu_time_max_percent_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+};
+
+static int __init init_events_core_sysctls(void)
+{
+ register_sysctl_init("kernel", events_core_sysctl_table);
+ return 0;
+}
+core_initcall(init_events_core_sysctls);
+
+
/*
* perf samples are done in some very critical code paths (NMIs).
* If they take too much CPU time, the system can lock up and not
@@ -1147,8 +1193,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
raw_spin_lock_init(&cpc->hrtimer_lock);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- timer->function = perf_mux_hrtimer_handler;
+ hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_HARD);
}
static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
@@ -1172,42 +1218,40 @@ static int perf_mux_hrtimer_restart_ipi(void *arg)
return perf_mux_hrtimer_restart(arg);
}
+static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu)
+{
+ return *this_cpu_ptr(pmu->cpu_pmu_context);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!(*count)++)
pmu->pmu_disable(pmu);
}
void perf_pmu_enable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!--(*count))
pmu->pmu_enable(pmu);
}
static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
-}
-
-static void get_ctx(struct perf_event_context *ctx)
-{
- refcount_inc(&ctx->refcount);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
+ WARN_ON_ONCE(*count == 0);
}
-static void *alloc_task_ctx_data(struct pmu *pmu)
+static inline void perf_pmu_read(struct perf_event *event)
{
- if (pmu->task_ctx_cache)
- return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
-
- return NULL;
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
}
-static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
+static void get_ctx(struct perf_event_context *ctx)
{
- if (pmu->task_ctx_cache && task_ctx_data)
- kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
+ refcount_inc(&ctx->refcount);
}
static void free_ctx(struct rcu_head *head)
@@ -2303,7 +2347,7 @@ static void
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
// XXX cpc serialization, probably per-cpu IRQ disabled
@@ -2444,9 +2488,8 @@ __perf_remove_from_context(struct perf_event *event,
pmu_ctx->rotate_necessary = 0;
if (ctx->task && ctx->is_active) {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu);
- cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
cpc->task_epc = NULL;
}
@@ -2584,7 +2627,7 @@ static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2691,7 +2734,7 @@ error:
static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
/*
* Groups consisting entirely of software events can always go on.
@@ -3314,9 +3357,8 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
struct pmu *pmu = pmu_ctx->pmu;
if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
- cpc = this_cpu_ptr(pmu->cpu_pmu_context);
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
cpc->task_epc = NULL;
}
@@ -3473,8 +3515,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ perf_pmu_read(event);
perf_event_update_time(event);
@@ -3522,52 +3563,17 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \
- for (pos1 = list_first_entry(head1, typeof(*pos1), member), \
- pos2 = list_first_entry(head2, typeof(*pos2), member); \
- !list_entry_is_head(pos1, head1, member) && \
- !list_entry_is_head(pos2, head2, member); \
- pos1 = list_next_entry(pos1, member), \
- pos2 = list_next_entry(pos2, member))
-
-static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
- struct perf_event_context *next_ctx)
-{
- struct perf_event_pmu_context *prev_epc, *next_epc;
-
- if (!prev_ctx->nr_task_data)
- return;
-
- double_list_for_each_entry(prev_epc, next_epc,
- &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
- pmu_ctx_entry) {
-
- if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
- continue;
-
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (prev_epc->pmu->swap_task_ctx)
- prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
- else
- swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
- }
-}
-
-static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
+ struct task_struct *task, bool sched_in)
{
struct perf_event_pmu_context *pmu_ctx;
struct perf_cpu_pmu_context *cpc;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ cpc = this_cpc(pmu_ctx->pmu);
if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
- pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+ pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in);
}
}
@@ -3630,17 +3636,16 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- perf_ctx_sched_task_cb(ctx, false);
- perf_event_swap_task_ctx_data(ctx, next_ctx);
+ perf_ctx_sched_task_cb(ctx, task, false);
perf_ctx_enable(ctx, false);
/*
* RCU_INIT_POINTER here is safe because we've not
* modified the ctx and the above modification of
- * ctx->task and ctx->task_ctx_data are immaterial
- * since those values are always verified under
- * ctx->lock which we're now holding.
+ * ctx->task is immaterial since this value is
+ * always verified under ctx->lock which we're now
+ * holding.
*/
RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
@@ -3660,7 +3665,7 @@ unlock:
perf_ctx_disable(ctx, false);
inside_switch:
- perf_ctx_sched_task_cb(ctx, false);
+ perf_ctx_sched_task_cb(ctx, task, false);
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
@@ -3673,7 +3678,7 @@ static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
this_cpu_dec(perf_sched_cb_usages);
barrier();
@@ -3685,7 +3690,7 @@ void perf_sched_cb_dec(struct pmu *pmu)
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
if (!cpc->sched_cb_usage++)
list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
@@ -3702,7 +3707,8 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc,
+ struct task_struct *task, bool sched_in)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
@@ -3716,7 +3722,7 @@ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_i
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
- pmu->sched_task(cpc->task_epc, sched_in);
+ pmu->sched_task(cpc->task_epc, task, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3734,7 +3740,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
return;
list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
- __perf_pmu_sched_task(cpc, sched_in);
+ __perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in);
}
static void perf_event_switch(struct task_struct *task,
@@ -3802,7 +3808,7 @@ static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
if (!pmu_ctx->ctx->task)
return;
- cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ cpc = this_cpc(pmu_ctx->pmu);
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
cpc->task_epc = pmu_ctx;
}
@@ -3930,11 +3936,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+
+ if (*perf_event_fasync(event))
+ event->pending_kill = POLL_HUP;
+
+ perf_event_wakeup(event);
} else {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
event->pmu_ctx->rotate_necessary = 1;
- cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
perf_mux_hrtimer_restart(cpc);
group_update_userpage(event);
}
@@ -4029,7 +4039,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_ctx_lock(cpuctx, ctx);
perf_ctx_disable(ctx, false);
- perf_ctx_sched_task_cb(ctx, true);
+ perf_ctx_sched_task_cb(ctx, task, true);
perf_ctx_enable(ctx, false);
perf_ctx_unlock(cpuctx, ctx);
@@ -4060,7 +4070,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_event_sched_in(cpuctx, ctx, NULL);
- perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
perf_ctx_enable(&cpuctx->ctx, false);
@@ -4618,15 +4628,8 @@ static void __perf_event_read(void *info)
pmu->read(event);
- for_each_sibling_event(sub, event) {
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
- /*
- * Use sibling's PMU rather than @event's since
- * sibling could be on different (eg: software) PMU.
- */
- sub->pmu->read(sub);
- }
- }
+ for_each_sibling_event(sub, event)
+ perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
@@ -4883,7 +4886,7 @@ find_get_context(struct task_struct *task, struct perf_event *event)
if (!task) {
/* Must be root to operate on a CPU event: */
- err = perf_allow_cpu(&event->attr);
+ err = perf_allow_cpu();
if (err)
return ERR_PTR(err);
@@ -4951,7 +4954,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
struct perf_event *event)
{
struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;
- void *task_ctx_data = NULL;
if (!ctx->task) {
/*
@@ -4961,11 +4963,14 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
*/
struct perf_cpu_pmu_context *cpc;
- cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
epc = &cpc->epc;
raw_spin_lock_irq(&ctx->lock);
if (!epc->ctx) {
- atomic_set(&epc->refcount, 1);
+ /*
+ * One extra reference for the pmu; see perf_pmu_free().
+ */
+ atomic_set(&epc->refcount, 2);
epc->embedded = 1;
list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
epc->ctx = ctx;
@@ -4981,14 +4986,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
if (!new)
return ERR_PTR(-ENOMEM);
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = alloc_task_ctx_data(pmu);
- if (!task_ctx_data) {
- kfree(new);
- return ERR_PTR(-ENOMEM);
- }
- }
-
__perf_init_event_pmu_context(new, pmu);
/*
@@ -5023,14 +5020,7 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
epc->ctx = ctx;
found_epc:
- if (task_ctx_data && !epc->task_ctx_data) {
- epc->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- ctx->nr_task_data++;
- }
raw_spin_unlock_irq(&ctx->lock);
-
- free_task_ctx_data(pmu, task_ctx_data);
kfree(new);
return epc;
@@ -5041,11 +5031,18 @@ static void get_pmu_ctx(struct perf_event_pmu_context *epc)
WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
}
+static void free_cpc_rcu(struct rcu_head *head)
+{
+ struct perf_cpu_pmu_context *cpc =
+ container_of(head, typeof(*cpc), epc.rcu_head);
+
+ kfree(cpc);
+}
+
static void free_epc_rcu(struct rcu_head *head)
{
struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
- kfree(epc->task_ctx_data);
kfree(epc);
}
@@ -5075,8 +5072,10 @@ static void put_pmu_ctx(struct perf_event_pmu_context *epc)
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- if (epc->embedded)
+ if (epc->embedded) {
+ call_rcu(&epc->rcu_head, free_cpc_rcu);
return;
+ }
call_rcu(&epc->rcu_head, free_epc_rcu);
}
@@ -5152,6 +5151,225 @@ static void unaccount_freq_event(void)
atomic_dec(&nr_freq_events);
}
+
+static struct perf_ctx_data *
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+{
+ struct perf_ctx_data *cd;
+
+ cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+ if (!cd)
+ return NULL;
+
+ cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+ if (!cd->data) {
+ kfree(cd);
+ return NULL;
+ }
+
+ cd->global = global;
+ cd->ctx_cache = ctx_cache;
+ refcount_set(&cd->refcount, 1);
+
+ return cd;
+}
+
+static void free_perf_ctx_data(struct perf_ctx_data *cd)
+{
+ kmem_cache_free(cd->ctx_cache, cd->data);
+ kfree(cd);
+}
+
+static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+ struct perf_ctx_data *cd;
+
+ cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
+ free_perf_ctx_data(cd);
+}
+
+static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
+{
+ call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
+}
+
+static int
+attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
+ bool global)
+{
+ struct perf_ctx_data *cd, *old = NULL;
+
+ cd = alloc_perf_ctx_data(ctx_cache, global);
+ if (!cd)
+ return -ENOMEM;
+
+ for (;;) {
+ if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+ if (old)
+ perf_free_ctx_data_rcu(old);
+ return 0;
+ }
+
+ if (!old) {
+ /*
+ * After seeing a dead @old, we raced with
+ * removal and lost, try again to install @cd.
+ */
+ continue;
+ }
+
+ if (refcount_inc_not_zero(&old->refcount)) {
+ free_perf_ctx_data(cd); /* unused */
+ return 0;
+ }
+
+ /*
+ * @old is a dead object, refcount==0 is stable, try and
+ * replace it with @cd.
+ */
+ }
+ return 0;
+}
+
+static void __detach_global_ctx_data(void);
+DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
+static refcount_t global_ctx_data_ref;
+
+static int
+attach_global_ctx_data(struct kmem_cache *ctx_cache)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+ int ret;
+
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+again:
+ /* Allocate everything */
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (cd && !cd->global) {
+ cd->global = 1;
+ if (!refcount_inc_not_zero(&cd->refcount))
+ cd = NULL;
+ }
+ if (!cd) {
+ get_task_struct(p);
+ goto alloc;
+ }
+ }
+ }
+
+ refcount_set(&global_ctx_data_ref, 1);
+
+ return 0;
+alloc:
+ ret = attach_task_ctx_data(p, ctx_cache, true);
+ put_task_struct(p);
+ if (ret) {
+ __detach_global_ctx_data();
+ return ret;
+ }
+ goto again;
+}
+
+static int
+attach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+ struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
+ int ret;
+
+ if (!ctx_cache)
+ return -ENOMEM;
+
+ if (task)
+ return attach_task_ctx_data(task, ctx_cache, false);
+
+ ret = attach_global_ctx_data(ctx_cache);
+ if (ret)
+ return ret;
+
+ event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
+ return 0;
+}
+
+static void
+detach_task_ctx_data(struct task_struct *p)
+{
+ struct perf_ctx_data *cd;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !refcount_dec_and_test(&cd->refcount))
+ return;
+ }
+
+ /*
+	 * The old ctx_data may be lost because of the race;
+	 * nothing needs to be done in that case.
+ * See attach_task_ctx_data().
+ */
+ if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
+ perf_free_ctx_data_rcu(cd);
+}
+
+static void __detach_global_ctx_data(void)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+
+again:
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !cd->global)
+ continue;
+ cd->global = 0;
+ get_task_struct(p);
+ goto detach;
+ }
+ }
+ return;
+detach:
+ detach_task_ctx_data(p);
+ put_task_struct(p);
+ goto again;
+}
+
+static void detach_global_ctx_data(void)
+{
+ if (refcount_dec_not_one(&global_ctx_data_ref))
+ return;
+
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (!refcount_dec_and_test(&global_ctx_data_ref))
+ return;
+
+ /* remove everything */
+ __detach_global_ctx_data();
+}
+
+static void detach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+
+ event->attach_state &= ~PERF_ATTACH_TASK_DATA;
+
+ if (task)
+ return detach_task_ctx_data(task);
+
+ if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
+ detach_global_ctx_data();
+ event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
+ }
+}
+
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
@@ -5246,6 +5464,8 @@ static int exclusive_event_init(struct perf_event *event)
return -EBUSY;
}
+ event->attach_state |= PERF_ATTACH_EXCLUSIVE;
+
return 0;
}
@@ -5253,14 +5473,13 @@ static void exclusive_event_destroy(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!is_exclusive_pmu(pmu))
- return;
-
/* see comment in exclusive_event_init() */
if (event->attach_state & PERF_ATTACH_TASK)
atomic_dec(&pmu->exclusive_cnt);
else
atomic_inc(&pmu->exclusive_cnt);
+
+ event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
}
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
@@ -5292,8 +5511,7 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
-static void perf_addr_filters_splice(struct perf_event *event,
- struct list_head *head);
+static void perf_free_addr_filters(struct perf_event *event);
static void perf_pending_task_sync(struct perf_event *event)
{
@@ -5319,39 +5537,22 @@ static void perf_pending_task_sync(struct perf_event *event)
rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
}
-static void _free_event(struct perf_event *event)
+/* vs perf_event_alloc() error */
+static void __free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending_irq);
- irq_work_sync(&event->pending_disable_irq);
- perf_pending_task_sync(event);
-
- unaccount_event(event);
+ if (event->attach_state & PERF_ATTACH_CALLCHAIN)
+ put_callchain_buffers();
- security_perf_event_free(event);
+ kfree(event->addr_filter_ranges);
- if (event->rb) {
- /*
- * Can happen when we close an event with re-directed output.
- *
- * Since we have a 0 refcount, perf_mmap_close() will skip
- * over us; possibly making our ring_buffer_put() the last.
- */
- mutex_lock(&event->mmap_mutex);
- ring_buffer_attach(event, NULL);
- mutex_unlock(&event->mmap_mutex);
- }
+ if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
+ exclusive_event_destroy(event);
if (is_cgroup_event(event))
perf_detach_cgroup(event);
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
- perf_addr_filters_splice(event, NULL);
- kfree(event->addr_filter_ranges);
+ if (event->attach_state & PERF_ATTACH_TASK_DATA)
+ detach_perf_ctx_data(event);
if (event->destroy)
event->destroy(event);
@@ -5363,22 +5564,60 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
- if (event->pmu_ctx)
+ if (event->pmu_ctx) {
+ /*
+ * put_pmu_ctx() needs an event->ctx reference, because of
+ * epc->ctx.
+ */
+ WARN_ON_ONCE(!event->ctx);
+ WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
+ }
/*
- * perf_event_free_task() relies on put_ctx() being 'last', in particular
- * all task references must be cleaned up.
+ * perf_event_free_task() relies on put_ctx() being 'last', in
+ * particular all task references must be cleaned up.
*/
if (event->ctx)
put_ctx(event->ctx);
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
+ if (event->pmu)
+ module_put(event->pmu->module);
call_rcu(&event->rcu_head, free_event_rcu);
}
+DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))
+
+/* vs perf_event_alloc() success */
+static void _free_event(struct perf_event *event)
+{
+ irq_work_sync(&event->pending_irq);
+ irq_work_sync(&event->pending_disable_irq);
+ perf_pending_task_sync(event);
+
+ unaccount_event(event);
+
+ security_perf_event_free(event);
+
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ __free_event(event);
+}
+
/*
* Used to free events which have a known refcount of 1, such as in error paths
* where the event isn't exposed yet and inherited events.
@@ -5847,6 +6086,10 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
if (is_event_hup(event))
return events;
+ if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
+ event->attr.pinned))
+ return events;
+
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
* perf_event_set_output() can swizzle our rb and make us miss wakeups.
@@ -6616,7 +6859,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
- int ret = 0, flags = 0;
+ int ret, flags = 0;
/*
* Don't allow mmap() of inherited per-task counters. This would
@@ -6634,9 +6877,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = vma_size / PAGE_SIZE;
+
+ if (nr_pages > INT_MAX)
+ return -ENOMEM;
+
+ if (vma_size != PAGE_SIZE * nr_pages)
+ return -EINVAL;
+
+ user_extra = nr_pages;
+
+ mutex_lock(&event->mmap_mutex);
+ ret = -EINVAL;
if (vma->vm_pgoff == 0) {
- nr_pages = (vma_size / PAGE_SIZE) - 1;
+ nr_pages -= 1;
+
+ /*
+ * If we have rb pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
+ */
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ goto unlock;
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+
+ if (event->rb) {
+ if (data_page_nr(event->rb) != nr_pages)
+ goto unlock;
+
+ if (atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Success -- managed to mmap() the same buffer
+ * multiple times.
+ */
+ ret = 0;
+ /* We need the rb to map pages. */
+ rb = event->rb;
+ goto unlock;
+ }
+
+ /*
+			 * Raced against perf_mmap_close()'s
+			 * atomic_dec_and_mutex_lock(); remove the
+			 * event and continue as if !event->rb.
+ */
+ ring_buffer_attach(event, NULL);
+ }
+
} else {
/*
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
@@ -6645,16 +6933,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
*/
u64 aux_offset, aux_size;
- if (!event->rb)
- return -EINVAL;
-
- nr_pages = vma_size / PAGE_SIZE;
- if (nr_pages > INT_MAX)
- return -ENOMEM;
-
- mutex_lock(&event->mmap_mutex);
- ret = -EINVAL;
-
rb = event->rb;
if (!rb)
goto aux_unlock;
@@ -6695,48 +6973,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
}
atomic_set(&rb->aux_mmap_count, 1);
- user_extra = nr_pages;
-
- goto accounting;
- }
-
- /*
- * If we have rb pages ensure they're a power-of-two number, so we
- * can do bitmasks instead of modulo.
- */
- if (nr_pages != 0 && !is_power_of_2(nr_pages))
- return -EINVAL;
-
- if (vma_size != PAGE_SIZE * (1 + nr_pages))
- return -EINVAL;
-
- WARN_ON_ONCE(event->ctx->parent_ctx);
-again:
- mutex_lock(&event->mmap_mutex);
- if (event->rb) {
- if (data_page_nr(event->rb) != nr_pages) {
- ret = -EINVAL;
- goto unlock;
- }
-
- if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
- /*
- * Raced against perf_mmap_close(); remove the
- * event and try again.
- */
- ring_buffer_attach(event, NULL);
- mutex_unlock(&event->mmap_mutex);
- goto again;
- }
-
- /* We need the rb to map pages. */
- rb = event->rb;
- goto unlock;
}
- user_extra = nr_pages + 1;
-
-accounting:
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/*
@@ -6804,6 +7042,8 @@ accounting:
rb->aux_mmap_locked = extra;
}
+ ret = 0;
+
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
@@ -6828,7 +7068,7 @@ aux_unlock:
if (!ret)
ret = map_range(rb, vma);
- if (event->pmu->event_mapped)
+ if (!ret && event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm);
return ret;
@@ -7452,9 +7692,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if ((leader != event) &&
- (leader->state == PERF_EVENT_STATE_ACTIVE))
- leader->pmu->read(leader);
+ if ((leader != event) && !handle->skip_read)
+ perf_pmu_read(leader);
values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
@@ -7467,9 +7706,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
for_each_sibling_event(sub, leader) {
n = 0;
- if ((sub != event) &&
- (sub->state == PERF_EVENT_STATE_ACTIVE))
- sub->pmu->read(sub);
+ if ((sub != event) && !handle->skip_read)
+ perf_pmu_read(sub);
values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
@@ -7528,6 +7766,9 @@ void perf_output_sample(struct perf_output_handle *handle,
{
u64 sample_type = data->type;
+ if (data->sample_flags & PERF_SAMPLE_READ)
+ handle->skip_read = 1;
+
perf_output_put(handle, *header);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
@@ -8522,10 +8763,58 @@ static void perf_event_task(struct task_struct *task,
task_ctx);
}
+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU-specific data.
+ */
+static void
+perf_event_alloc_task_data(struct task_struct *child,
+ struct task_struct *parent)
+{
+ struct kmem_cache *ctx_cache = NULL;
+ struct perf_ctx_data *cd;
+
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(parent->perf_ctx_data);
+ if (cd)
+ ctx_cache = cd->ctx_cache;
+ }
+
+ if (!ctx_cache)
+ return;
+
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ scoped_guard (rcu) {
+ cd = rcu_dereference(child->perf_ctx_data);
+ if (!cd) {
+ /*
+			 * A system-wide event may be unaccounted while
+			 * the perf_ctx_data is being attached.
+ */
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+ goto attach;
+ }
+
+ if (!cd->global) {
+ cd->global = 1;
+ refcount_inc(&cd->refcount);
+ }
+ }
+
+ return;
+attach:
+ attach_task_ctx_data(child, ctx_cache, true);
+}
+
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
perf_event_namespaces(task);
+ perf_event_alloc_task_data(task, current);
}
/*
@@ -8589,7 +8878,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
unsigned int size;
memset(comm, 0, sizeof(comm));
- strscpy(comm, comm_event->task->comm, sizeof(comm));
+ strscpy(comm, comm_event->task->comm);
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
@@ -9033,7 +9322,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
}
cpy_name:
- strscpy(tmp, name, sizeof(tmp));
+ strscpy(tmp, name);
name = tmp;
got_name:
/*
@@ -9457,7 +9746,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
goto err;
- strscpy(name, sym, KSYM_NAME_LEN);
+ strscpy(name, sym);
name_len = strlen(name) + 1;
while (!IS_ALIGNED(name_len, sizeof(u64)))
name[name_len++] = '\0';
@@ -10840,6 +11129,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
void perf_event_free_bpf_prog(struct perf_event *event)
{
+ if (!event->prog)
+ return;
+
if (!perf_event_is_tracing(event)) {
perf_event_free_bpf_handler(event);
return;
@@ -10938,6 +11230,17 @@ static void perf_addr_filters_splice(struct perf_event *event,
free_filters_list(&list);
}
+static void perf_free_addr_filters(struct perf_event *event)
+{
+ /*
+	 * Used during free paths; there is no concurrency.
+ */
+ if (list_empty(&event->addr_filters.list))
+ return;
+
+ perf_addr_filters_splice(event, NULL);
+}
+
/*
* Scan through mm's vmas and see if one of them matches the
* @filter; if so, adjust filter's address range.
@@ -11376,8 +11679,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- hwc->hrtimer.function = perf_swevent_hrtimer;
+ hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
/*
* Since hrtimers have a fixed rate, we can do a static freq->period
@@ -11614,11 +11916,6 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-static void free_pmu_context(struct pmu *pmu)
-{
- free_percpu(pmu->cpu_pmu_context);
-}
-
/*
* Let userspace know that this PMU supports address range filtering:
*/
@@ -11628,7 +11925,7 @@ static ssize_t nr_addr_filters_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+ return sysfs_emit(page, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);
@@ -11639,7 +11936,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
+ return sysfs_emit(page, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -11650,7 +11947,7 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
+ return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms);
}
static DEFINE_MUTEX(mux_interval_mutex);
@@ -11681,7 +11978,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
cpus_read_lock();
for_each_online_cpu(cpu) {
struct perf_cpu_pmu_context *cpc;
- cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
@@ -11824,6 +12121,7 @@ del_dev:
free_dev:
put_device(pmu->dev);
+ pmu->dev = NULL;
goto out;
}
@@ -11845,57 +12143,85 @@ static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new)
return true;
}
-int perf_pmu_register(struct pmu *pmu, const char *name, int type)
+static void perf_pmu_free(struct pmu *pmu)
{
- int cpu, ret, max = PERF_TYPE_MAX;
+ if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
- mutex_lock(&pmus_lock);
- ret = -ENOMEM;
- pmu->pmu_disable_count = alloc_percpu(int);
- if (!pmu->pmu_disable_count)
- goto unlock;
+ if (pmu->cpu_pmu_context) {
+ int cpu;
- pmu->type = -1;
- if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
- ret = -EINVAL;
- goto free_pdc;
- }
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_pmu_context *cpc;
- if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
- ret = -EINVAL;
- goto free_pdc;
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ if (!cpc)
+ continue;
+ if (cpc->epc.embedded) {
+ /* refcount managed */
+ put_pmu_ctx(&cpc->epc);
+ continue;
+ }
+ kfree(cpc);
+ }
+ free_percpu(pmu->cpu_pmu_context);
}
+}
+
+DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T))
+
+int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
+{
+ int cpu, max = PERF_TYPE_MAX;
+
+ struct pmu *pmu __free(pmu_unregister) = _pmu;
+ guard(mutex)(&pmus_lock);
+
+ if (WARN_ONCE(!name, "Can not register anonymous pmu.\n"))
+ return -EINVAL;
+
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE,
+ "Can not register a pmu with an invalid scope.\n"))
+ return -EINVAL;
pmu->name = name;
if (type >= 0)
max = type;
- ret = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL);
- if (ret < 0)
- goto free_pdc;
+ CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL);
+ if (pmu_type.id < 0)
+ return pmu_type.id;
- WARN_ON(type >= 0 && ret != type);
+ WARN_ON(type >= 0 && pmu_type.id != type);
- type = ret;
- pmu->type = type;
+ pmu->type = pmu_type.id;
atomic_set(&pmu->exclusive_cnt, 0);
if (pmu_bus_running && !pmu->dev) {
- ret = pmu_dev_alloc(pmu);
+ int ret = pmu_dev_alloc(pmu);
if (ret)
- goto free_idr;
+ return ret;
}
- ret = -ENOMEM;
- pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *);
if (!pmu->cpu_pmu_context)
- goto free_dev;
+ return -ENOMEM;
for_each_possible_cpu(cpu) {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc =
+ kmalloc_node(sizeof(struct perf_cpu_pmu_context),
+ GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
- cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ if (!cpc)
+ return -ENOMEM;
+
+ *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc;
__perf_init_event_pmu_context(&cpc->epc, pmu);
__perf_mux_hrtimer_init(cpc, cpu);
}
@@ -11932,39 +12258,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))
- goto free_context;
+ return -EINVAL;
list_add_rcu(&pmu->entry, &pmus);
- ret = 0;
-unlock:
- mutex_unlock(&pmus_lock);
-
- return ret;
-
-free_context:
- free_percpu(pmu->cpu_pmu_context);
-
-free_dev:
- if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
- device_del(pmu->dev);
- put_device(pmu->dev);
- }
-
-free_idr:
- idr_remove(&pmu_idr, pmu->type);
-
-free_pdc:
- free_percpu(pmu->pmu_disable_count);
- goto unlock;
+ take_idr_id(pmu_type);
+ _pmu = no_free_ptr(pmu); // let it rip
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
- mutex_lock(&pmus_lock);
- list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
- mutex_unlock(&pmus_lock);
+ scoped_guard (mutex, &pmus_lock) {
+ list_del_rcu(&pmu->entry);
+ idr_remove(&pmu_idr, pmu->type);
+ }
/*
* We dereference the pmu list under both SRCU and regular RCU, so
@@ -11973,14 +12281,7 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
- free_percpu(pmu->pmu_disable_count);
- if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
- if (pmu->nr_addr_filters)
- device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
- device_del(pmu->dev);
- put_device(pmu->dev);
- }
- free_pmu_context(pmu);
+ perf_pmu_free(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12020,48 +12321,61 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
- if (!ret) {
- if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
- has_extended_regs(event))
- ret = -EOPNOTSUPP;
+ if (ret)
+ goto err_pmu;
- if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
- event_has_any_exclude_flag(event))
- ret = -EINVAL;
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event)) {
+ ret = -EOPNOTSUPP;
+ goto err_destroy;
+ }
- if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
- const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
- struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
- int cpu;
-
- if (pmu_cpumask && cpumask) {
- cpu = cpumask_any_and(pmu_cpumask, cpumask);
- if (cpu >= nr_cpu_ids)
- ret = -ENODEV;
- else
- event->event_caps |= PERF_EV_CAP_READ_SCOPE;
- } else {
- ret = -ENODEV;
- }
- }
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event)) {
+ ret = -EINVAL;
+ goto err_destroy;
+ }
+
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask;
+ struct cpumask *pmu_cpumask;
+ int cpu;
- if (ret && event->destroy)
- event->destroy(event);
+ cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ pmu_cpumask = perf_scope_cpumask(pmu->scope);
+
+ ret = -ENODEV;
+ if (!pmu_cpumask || !cpumask)
+ goto err_destroy;
+
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ goto err_destroy;
+
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
}
- if (ret)
- module_put(pmu->module);
+ return 0;
+
+err_destroy:
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+err_pmu:
+ event->pmu = NULL;
+ module_put(pmu->module);
return ret;
}
static struct pmu *perf_init_event(struct perf_event *event)
{
bool extended_type = false;
- int idx, type, ret;
struct pmu *pmu;
+ int type, ret;
- idx = srcu_read_lock(&pmus_srcu);
+ guard(srcu)(&pmus_srcu);
/*
* Save original type before calling pmu->event_init() since certain
@@ -12074,7 +12388,7 @@ static struct pmu *perf_init_event(struct perf_event *event)
pmu = event->parent->pmu;
ret = perf_try_init_event(pmu, event);
if (!ret)
- goto unlock;
+ return pmu;
}
/*
@@ -12093,13 +12407,12 @@ static struct pmu *perf_init_event(struct perf_event *event)
}
again:
- rcu_read_lock();
- pmu = idr_find(&pmu_idr, type);
- rcu_read_unlock();
+ scoped_guard (rcu)
+ pmu = idr_find(&pmu_idr, type);
if (pmu) {
if (event->attr.type != type && type != PERF_TYPE_RAW &&
!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
- goto fail;
+ return ERR_PTR(-ENOENT);
ret = perf_try_init_event(pmu, event);
if (ret == -ENOENT && event->attr.type != type && !extended_type) {
@@ -12108,27 +12421,21 @@ again:
}
if (ret)
- pmu = ERR_PTR(ret);
+ return ERR_PTR(ret);
- goto unlock;
+ return pmu;
}
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
- goto unlock;
+ return pmu;
- if (ret != -ENOENT) {
- pmu = ERR_PTR(ret);
- goto unlock;
- }
+ if (ret != -ENOENT)
+ return ERR_PTR(ret);
}
-fail:
- pmu = ERR_PTR(-ENOENT);
-unlock:
- srcu_read_unlock(&pmus_srcu, idx);
- return pmu;
+ return ERR_PTR(-ENOENT);
}
static void attach_sb_event(struct perf_event *event)
@@ -12255,7 +12562,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
void *context, int cgroup_fd)
{
struct pmu *pmu;
- struct perf_event *event;
struct hw_perf_event *hwc;
long err = -EINVAL;
int node;
@@ -12270,8 +12576,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
- event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
- node);
+ struct perf_event *event __free(__free_event) =
+ kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node);
if (!event)
return ERR_PTR(-ENOMEM);
@@ -12378,15 +12684,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* See perf_output_read().
*/
if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
- goto err_ns;
+ return ERR_PTR(-EINVAL);
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
pmu = perf_init_event(event);
- if (IS_ERR(pmu)) {
- err = PTR_ERR(pmu);
- goto err_ns;
+ if (IS_ERR(pmu))
+ return (void*)pmu;
+
+ /*
+	 * PERF_ATTACH_TASK_DATA is set in event_init()->hw_config().
+	 * The attach must happen right after perf_init_event();
+	 * otherwise __free_event() would mistakenly try to detach a
+	 * nonexistent perf_ctx_data if a later error path is taken.
+ */
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ err = attach_perf_ctx_data(event);
+ if (err)
+ return ERR_PTR(err);
}
/*
@@ -12394,49 +12710,39 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* events (they don't make sense as the cgroup will be different
* on other CPUs in the uncore mask).
*/
- if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
- err = -EINVAL;
- goto err_pmu;
- }
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1))
+ return ERR_PTR(-EINVAL);
if (event->attr.aux_output &&
(!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
- event->attr.aux_pause || event->attr.aux_resume)) {
- err = -EOPNOTSUPP;
- goto err_pmu;
- }
+ event->attr.aux_pause || event->attr.aux_resume))
+ return ERR_PTR(-EOPNOTSUPP);
- if (event->attr.aux_pause && event->attr.aux_resume) {
- err = -EINVAL;
- goto err_pmu;
- }
+ if (event->attr.aux_pause && event->attr.aux_resume)
+ return ERR_PTR(-EINVAL);
if (event->attr.aux_start_paused) {
- if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
- err = -EOPNOTSUPP;
- goto err_pmu;
- }
+ if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+ return ERR_PTR(-EOPNOTSUPP);
event->hw.aux_paused = 1;
}
if (cgroup_fd != -1) {
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
if (err)
- goto err_pmu;
+ return ERR_PTR(err);
}
err = exclusive_event_init(event);
if (err)
- goto err_pmu;
+ return ERR_PTR(err);
if (has_addr_filter(event)) {
event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
sizeof(struct perf_addr_filter_range),
GFP_KERNEL);
- if (!event->addr_filter_ranges) {
- err = -ENOMEM;
- goto err_per_task;
- }
+ if (!event->addr_filter_ranges)
+ return ERR_PTR(-ENOMEM);
/*
* Clone the parent's vma offsets: they are valid until exec()
@@ -12460,42 +12766,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers(attr->sample_max_stack);
if (err)
- goto err_addr_filters;
+ return ERR_PTR(err);
+ event->attach_state |= PERF_ATTACH_CALLCHAIN;
}
}
err = security_perf_event_alloc(event);
if (err)
- goto err_callchain_buffer;
+ return ERR_PTR(err);
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
- return event;
-
-err_callchain_buffer:
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-err_addr_filters:
- kfree(event->addr_filter_ranges);
-
-err_per_task:
- exclusive_event_destroy(event);
-
-err_pmu:
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
- if (event->destroy)
- event->destroy(event);
- module_put(pmu->module);
-err_ns:
- if (event->hw.target)
- put_task_struct(event->hw.target);
- call_rcu(&event->rcu_head, free_event_rcu);
-
- return ERR_PTR(err);
+ return_ptr(event);
}
static int perf_copy_attr(struct perf_event_attr __user *uattr,
@@ -12565,7 +12848,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
}
/* privileged levels capture (kernel, hv): check permissions */
if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
- ret = perf_allow_kernel(attr);
+ ret = perf_allow_kernel();
if (ret)
return ret;
}
@@ -12822,12 +13105,12 @@ SYSCALL_DEFINE5(perf_event_open,
return err;
/* Do we allow access to perf_event_open(2) ? */
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ err = security_perf_event_open(PERF_SECURITY_OPEN);
if (err)
return err;
if (!attr.exclude_kernel) {
- err = perf_allow_kernel(&attr);
+ err = perf_allow_kernel();
if (err)
return err;
}
@@ -12847,7 +13130,7 @@ SYSCALL_DEFINE5(perf_event_open,
/* Only privileged users can get physical addresses */
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
- err = perf_allow_kernel(&attr);
+ err = perf_allow_kernel();
if (err)
return err;
}
@@ -13569,6 +13852,12 @@ void perf_event_exit_task(struct task_struct *child)
* At this point we need to send EXIT events to cpu contexts.
*/
perf_event_task(child, NULL, 0);
+
+ /*
+ * Detach the perf_ctx_data for the system-wide event.
+ */
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ detach_task_ctx_data(child);
}
static void perf_free_event(struct perf_event *event,
@@ -13680,12 +13969,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
return &event->attr;
}
-int perf_allow_kernel(struct perf_event_attr *attr)
+int perf_allow_kernel(void)
{
if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
return -EACCES;
- return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
+ return security_perf_event_open(PERF_SECURITY_KERNEL);
}
EXPORT_SYMBOL_GPL(perf_allow_kernel);
@@ -13744,7 +14033,6 @@ inherit_event(struct perf_event *parent_event,
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
mutex_unlock(&parent_event->child_mutex);
- /* task_ctx_data is freed with child_ctx */
free_event(child_event);
return NULL;
}
@@ -14002,6 +14290,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
+ child->perf_ctx_data = NULL;
ret = perf_event_init_context(child, clone_flags);
if (ret) {
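
The new attach_task_ctx_data() in the events/core.c hunks above installs a per-task perf_ctx_data with a lock-free "install or reuse" loop: try_cmpxchg() publishes the new object, and when another object is already published the code takes a reference with refcount_inc_not_zero(), retrying only if it observes a dead (refcount == 0) object. A stand-alone user-space sketch of that pattern using C11 atomics; the names (ctx_data, attach, ref_get_not_zero) are illustrative, not the kernel's, and the kernel defers the free via RCU rather than calling free() directly:

#include <stdatomic.h>
#include <stdlib.h>

struct ctx_data {
	atomic_int refcount;			/* 0 means the object is dead */
	void *payload;
};

static _Atomic(struct ctx_data *) slot;		/* models task->perf_ctx_data */

/* Take a reference unless the count already dropped to zero. */
static int ref_get_not_zero(struct ctx_data *cd)
{
	int old = atomic_load(&cd->refcount);

	while (old) {
		if (atomic_compare_exchange_weak(&cd->refcount, &old, old + 1))
			return 1;
	}
	return 0;
}

/* Install @cd, or reuse a live object that is already installed. */
static int attach(struct ctx_data *cd)
{
	struct ctx_data *old = NULL;

	for (;;) {
		if (atomic_compare_exchange_strong(&slot, &old, cd)) {
			free(old);	/* the kernel defers this via RCU */
			return 0;
		}
		if (!old)
			continue;	/* raced with removal; retry install */
		if (ref_get_not_zero(old)) {
			free(cd);	/* unused: reuse the existing object */
			return 0;
		}
		/* @old is dead (refcount == 0); loop and try to replace it. */
	}
}

int main(void)
{
	struct ctx_data *cd = calloc(1, sizeof(*cd));

	if (!cd)
		return 1;
	atomic_init(&cd->refcount, 1);
	return attach(cd);
}
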
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bc4a61029b6d..8ec2cb688903 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
return -ENOENT;
/*
- * no branch sampling for breakpoint events
+ * Check if breakpoint type is supported before proceeding.
+ * Also, no branch sampling for breakpoint events.
*/
- if (has_branch_stack(bp))
+ if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp))
return -EOPNOTSUPP;
err = register_perf_hw_breakpoint(bp);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 180509132d4b..5130b119d0ae 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -19,7 +19,7 @@
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->rb->poll, EPOLLIN);
+ atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM);
handle->event->pending_wakeup = 1;
@@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle,
handle->rb = rb;
handle->event = event;
+ handle->flags = 0;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b4ca8898fe17..70c84b9d7be3 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2169,8 +2169,8 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
*/
unsigned long uprobe_get_trampoline_vaddr(void)
{
+ unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR;
struct xol_area *area;
- unsigned long trampoline_vaddr = -1;
/* Pairs with xol_add_vma() smp_store_release() */
area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
@@ -2311,9 +2311,8 @@ bool uprobe_deny_signal(void)
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
if (task_sigpending(t)) {
- spin_lock_irq(&t->sighand->siglock);
+ utask->signal_denied = true;
clear_tsk_thread_flag(t, TIF_SIGPENDING);
- spin_unlock_irq(&t->sighand->siglock);
if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
utask->state = UTASK_SSTEP_TRAPPED;
@@ -2746,9 +2745,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
utask->state = UTASK_RUNNING;
xol_free_insn_slot(utask);
- spin_lock_irq(&current->sighand->siglock);
- recalc_sigpending(); /* see uprobe_deny_signal() */
- spin_unlock_irq(&current->sighand->siglock);
+ if (utask->signal_denied) {
+ set_thread_flag(TIF_SIGPENDING);
+ utask->signal_denied = false;
+ }
if (unlikely(err)) {
uprobe_warn(current, "execute the probed insn, sending SIGILL.");
diff --git a/kernel/exit.c b/kernel/exit.c
index 3485e5fc499e..c2e6c7b7779f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@
#include <linux/sysfs.h>
#include <linux/user_events.h>
#include <linux/uaccess.h>
+#include <linux/pidfs.h>
#include <uapi/linux/wait.h>
@@ -122,14 +123,22 @@ static __init int kernel_exit_sysfs_init(void)
late_initcall(kernel_exit_sysfs_init);
#endif
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task() would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+ struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+ bool group_dead)
{
nr_threads--;
- detach_pid(p, PIDTYPE_PID);
+ detach_pid(post->pids, p, PIDTYPE_PID);
if (group_dead) {
- detach_pid(p, PIDTYPE_TGID);
- detach_pid(p, PIDTYPE_PGID);
- detach_pid(p, PIDTYPE_SID);
+ detach_pid(post->pids, p, PIDTYPE_TGID);
+ detach_pid(post->pids, p, PIDTYPE_PGID);
+ detach_pid(post->pids, p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
@@ -141,7 +150,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
/*
* This function expects the tasklist_lock write-locked.
*/
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
@@ -174,9 +183,6 @@ static void __exit_signal(struct task_struct *tsk)
sig->curr_target = next_thread(tsk);
}
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
- sizeof(unsigned long long));
-
/*
* Accumulate here the counters for all threads as they die. We could
* skip the group leader because it is the last user of signal_struct,
@@ -197,23 +203,15 @@ static void __exit_signal(struct task_struct *tsk)
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
- __unhash_process(tsk, group_dead);
+ __unhash_process(post, tsk, group_dead);
write_sequnlock(&sig->stats_lock);
- /*
- * Do this under ->siglock, we can race with another thread
- * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
- */
- flush_sigqueue(&tsk->pending);
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
- clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
- if (group_dead) {
- flush_sigqueue(&sig->shared_pending);
+ if (group_dead)
tty_kref_put(tty);
- }
}
static void delayed_put_task_struct(struct rcu_head *rhp)
@@ -239,22 +237,27 @@ void __weak release_thread(struct task_struct *dead_task)
void release_task(struct task_struct *p)
{
+ struct release_task_post post;
struct task_struct *leader;
struct pid *thread_pid;
int zap_leader;
repeat:
+ memset(&post, 0, sizeof(post));
+
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock();
+ pidfs_exit(p);
cgroup_release(p);
+ thread_pid = get_pid(p->thread_pid);
+
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
- thread_pid = get_pid(p->thread_pid);
- __exit_signal(p);
+ __exit_signal(&post, p);
/*
* If we are the last non-leader member of the thread
@@ -278,7 +281,20 @@ repeat:
write_unlock_irq(&tasklist_lock);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
+ add_device_randomness(&p->se.sum_exec_runtime,
+ sizeof(p->se.sum_exec_runtime));
+ free_pids(post.pids);
release_thread(p);
+ /*
+ * This task was already removed from the process/thread/pid lists
+ * and lock_task_sighand(p) can't succeed. Nobody else can touch
+ * ->pending or, if group dead, signal->shared_pending. We can call
+ * flush_sigqueue() lockless.
+ */
+ flush_sigqueue(&p->pending);
+ if (thread_group_leader(p))
+ flush_sigqueue(&p->signal->shared_pending);
+
put_task_struct_rcu_user(p);
p = leader;
@@ -741,10 +757,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tsk->exit_state = EXIT_ZOMBIE;
/*
- * sub-thread or delay_group_leader(), wake up the
- * PIDFD_THREAD waiters.
+ * Ignore thread-group leaders that exited before all
+ * subthreads did.
*/
- if (!thread_group_empty(tsk))
+ if (!delay_group_leader(tsk))
do_notify_pidfd(tsk);
if (unlikely(tsk->ptrace)) {
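
The exit.c changes above follow a recurring shape: work that used to run under tasklist_lock (detaching and freeing pids, flushing sigqueues) is split so that only the list manipulation happens under the lock, while the actual freeing is collected into a small on-stack "post" structure and done after the lock is dropped. A minimal user-space sketch of that gather-under-lock / free-after-unlock shape, with purely illustrative names:

#include <pthread.h>
#include <stdlib.h>

#define NR_SLOTS 4

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *table[NR_SLOTS];		/* models the pid/thread links */

/* Collected under the lock, released after it is dropped. */
struct release_post {
	void *objs[NR_SLOTS];
};

static void release_entry(int idx)
{
	struct release_post post = { 0 };
	int i;

	pthread_mutex_lock(&table_lock);
	/* Only unlink under the lock; remember what must be freed. */
	post.objs[0] = table[idx];
	table[idx] = NULL;
	pthread_mutex_unlock(&table_lock);

	/* The expensive cleanup happens outside the critical section. */
	for (i = 0; i < NR_SLOTS; i++)
		free(post.objs[i]);
}

int main(void)
{
	table[1] = malloc(32);
	release_entry(1);
	return 0;
}
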
diff --git a/kernel/fork.c b/kernel/fork.c
index 735405a9c5f3..1b659b07ecd5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -504,6 +504,10 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
+ /* track_pfn_copy() will later take care of copying internal state. */
+ if (unlikely(new->vm_flags & VM_PFNMAP))
+ untrack_pfn_clear(new);
+
return new;
}
@@ -1891,8 +1895,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
#ifdef CONFIG_POSIX_TIMERS
INIT_HLIST_HEAD(&sig->posix_timers);
INIT_HLIST_HEAD(&sig->ignored_posix_timers);
- hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- sig->real_timer.function = it_real_fn;
+ hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
#endif
task_lock(current->group_leader);
@@ -2032,25 +2035,18 @@ static inline void rcu_copy_process(struct task_struct *p)
*/
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
- int pidfd;
struct file *pidfd_file;
- pidfd = get_unused_fd_flags(O_CLOEXEC);
+ CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
- if (IS_ERR(pidfd_file)) {
- put_unused_fd(pidfd);
+ if (IS_ERR(pidfd_file))
return PTR_ERR(pidfd_file);
- }
- /*
- * anon_inode_getfile() ignores everything outside of the
- * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
- */
- pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+
*ret = pidfd_file;
- return pidfd;
+ return take_fd(pidfd);
}
/**
@@ -2432,8 +2428,11 @@ __latent_entropy struct task_struct *copy_process(
if (clone_flags & CLONE_PIDFD) {
int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
- /* Note that no task has been attached to @pid yet. */
- retval = __pidfd_prepare(pid, flags, &pidfile);
+ /*
+		 * Note that no task has been attached to @pid yet; indicate
+		 * that via CLONE_PIDFD.
+ */
+ retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
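
The __pidfd_prepare() rewrite above leans on the kernel's scope-based cleanup helpers: CLASS(get_unused_fd, ...) reserves the descriptor and automatically puts it back on any early return, and take_fd() disarms that cleanup once ownership passes to the caller. A rough user-space analogue using the compiler's cleanup attribute (a GCC/Clang extension); the helper names and open_config() are made up for illustration:

#include <fcntl.h>
#include <unistd.h>

/* Auto-close an fd when it goes out of scope, unless it was "taken". */
static void auto_close(int *fd)
{
	if (*fd >= 0)
		close(*fd);
}

/* Disarm the cleanup and hand the descriptor to the caller. */
static int take_fd(int *fd)
{
	int ret = *fd;

	*fd = -1;
	return ret;
}

static int open_config(const char *path)
{
	int fd __attribute__((cleanup(auto_close))) = open(path, O_RDONLY);

	if (fd < 0)
		return -1;	/* nothing to undo; the cleanup is a no-op */

	/* ... further setup; any early return would auto-close fd ... */

	return take_fd(&fd);	/* success: ownership moves to the caller */
}

int main(void)
{
	int fd = open_config("/etc/hostname");

	if (fd >= 0)
		close(fd);
	return 0;
}
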
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 3db8567f5a44..cca15859a50b 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -50,10 +50,10 @@
*/
static struct {
struct futex_hash_bucket *queues;
- unsigned long hashsize;
+ unsigned long hashmask;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues (__futex_data.queues)
-#define futex_hashsize (__futex_data.hashsize)
+#define futex_hashmask (__futex_data.hashmask)
/*
@@ -119,7 +119,7 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
key->both.offset);
- return &futex_queues[hash & (futex_hashsize - 1)];
+ return &futex_queues[hash & futex_hashmask];
}
@@ -1127,27 +1127,28 @@ void futex_exit_release(struct task_struct *tsk)
static int __init futex_init(void)
{
+ unsigned long hashsize, i;
unsigned int futex_shift;
- unsigned long i;
#ifdef CONFIG_BASE_SMALL
- futex_hashsize = 16;
+ hashsize = 16;
#else
- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+ hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif
futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0, 0,
+ hashsize, 0, 0,
&futex_shift, NULL,
- futex_hashsize, futex_hashsize);
- futex_hashsize = 1UL << futex_shift;
+ hashsize, hashsize);
+ hashsize = 1UL << futex_shift;
- for (i = 0; i < futex_hashsize; i++) {
+ for (i = 0; i < hashsize; i++) {
atomic_set(&futex_queues[i].waiters, 0);
plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
+ futex_hashmask = hashsize - 1;
return 0;
}
core_initcall(futex_init);
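
The conversion above stores the bucket mask instead of the table size: since the table is a power of two, "hash & (hashsize - 1)" and "hash & hashmask" select the same bucket, and precomputing the mask saves the subtraction on every futex_hash() call. A minimal sketch of the lookup shape (the wrapper name is illustrative):

	static inline struct futex_hash_bucket *bucket_for(u32 hash)
	{
		/* futex_hashmask == hashsize - 1, computed once in futex_init() */
		return &futex_queues[hash & futex_hashmask];
	}
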
diff --git a/kernel/iomem.c b/kernel/iomem.c
index dc2120776e1c..75e61c1c6bc0 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -6,7 +6,8 @@
#include <linux/ioremap.h>
#ifndef arch_memremap_wb
-static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size,
+ unsigned long flags)
{
#ifdef ioremap_cache
return (__force void *)ioremap_cache(offset, size);
@@ -91,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
if (is_ram == REGION_INTERSECTS)
addr = try_ram_remap(offset, size, flags);
if (!addr)
- addr = arch_memremap_wb(offset, size);
+ addr = arch_memremap_wb(offset, size, flags);
}
/*
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 875f25ed6f71..3f02a0e45254 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -47,10 +47,6 @@ config GENERIC_IRQ_INJECTION
config HARDIRQS_SW_RESEND
bool
-# Edge style eoi based handler (cell)
-config IRQ_EDGE_EOI_HANDLER
- bool
-
# Generic configurable interrupt chip implementation
config GENERIC_IRQ_CHIP
bool
@@ -96,6 +92,7 @@ config GENERIC_MSI_IRQ
bool
select IRQ_DOMAIN_HIERARCHY
+# irqchip drivers should select this if they call iommu_dma_prepare_msi()
config IRQ_MSI_IOMMU
bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c901436ebd9f..36cf1b09cc84 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -232,6 +232,21 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
}
#endif
+static void irq_enable(struct irq_desc *desc)
+{
+ if (!irqd_irq_disabled(&desc->irq_data)) {
+ unmask_irq(desc);
+ } else {
+ irq_state_clr_disabled(desc);
+ if (desc->irq_data.chip->irq_enable) {
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ } else {
+ unmask_irq(desc);
+ }
+ }
+}
+
static int __irq_startup(struct irq_desc *desc)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
@@ -332,21 +347,6 @@ void irq_shutdown_and_deactivate(struct irq_desc *desc)
irq_domain_deactivate_irq(&desc->irq_data);
}
-void irq_enable(struct irq_desc *desc)
-{
- if (!irqd_irq_disabled(&desc->irq_data)) {
- unmask_irq(desc);
- } else {
- irq_state_clr_disabled(desc);
- if (desc->irq_data.chip->irq_enable) {
- desc->irq_data.chip->irq_enable(&desc->irq_data);
- irq_state_clr_masked(desc);
- } else {
- unmask_irq(desc);
- }
- }
-}
-
static void __irq_disable(struct irq_desc *desc, bool mask)
{
if (irqd_irq_disabled(&desc->irq_data)) {
@@ -838,53 +838,6 @@ out_unlock:
}
EXPORT_SYMBOL(handle_edge_irq);
-#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
-/**
- * handle_edge_eoi_irq - edge eoi type IRQ handler
- * @desc: the interrupt description structure for this irq
- *
- * Similar as the above handle_edge_irq, but using eoi and w/o the
- * mask/unmask logic.
- */
-void handle_edge_eoi_irq(struct irq_desc *desc)
-{
- struct irq_chip *chip = irq_desc_get_chip(desc);
-
- raw_spin_lock(&desc->lock);
-
- desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
-
- if (!irq_may_run(desc)) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
-
- /*
- * If its disabled or no action available then mask it and get
- * out of here.
- */
- if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
-
- kstat_incr_irqs_this_cpu(desc);
-
- do {
- if (unlikely(!desc->action))
- goto out_eoi;
-
- handle_irq_event(desc);
-
- } while ((desc->istate & IRQS_PENDING) &&
- !irqd_irq_disabled(&desc->irq_data));
-
-out_eoi:
- chip->irq_eoi(&desc->irq_data);
- raw_spin_unlock(&desc->lock);
-}
-#endif
-
/**
* handle_percpu_irq - Per CPU local irq handler
* @desc: the interrupt description structure for this irq
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a979523640d0..b0290849c395 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -90,7 +90,6 @@ extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
extern void irq_shutdown(struct irq_desc *desc);
extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
-extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
@@ -98,18 +97,12 @@ extern void mask_irq(struct irq_desc *desc);
extern void unmask_irq(struct irq_desc *desc);
extern void unmask_threaded_irq(struct irq_desc *desc);
-extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask);
-
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
#else
extern void irq_mark_irq(unsigned int irq);
#endif
-extern int __irq_get_irqchip_state(struct irq_data *data,
- enum irqchip_irq_state which,
- bool *state);
-
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
@@ -139,8 +132,6 @@ static inline void unregister_handler_proc(unsigned int irq,
extern bool irq_can_set_affinity_usr(unsigned int irq);
-extern void irq_set_thread_affinity(struct irq_desc *desc);
-
extern int irq_do_set_affinity(struct irq_data *data,
const struct cpumask *dest, bool force);
@@ -442,6 +433,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
return desc->pending_mask;
}
bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
+void irq_force_complete_move(struct irq_desc *desc);
#else /* CONFIG_GENERIC_PENDING_IRQ */
static inline bool irq_can_move_pcntxt(struct irq_data *data)
{
@@ -467,6 +459,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
{
return false;
}
+static inline void irq_force_complete_move(struct irq_desc *desc) { }
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 287830739783..4258cd6bd3b4 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -991,7 +991,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
}
-unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask)
+static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask)
{
unsigned int sum = 0;
int cpu;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index ec6d8e72d980..2861f89880af 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1589,9 +1589,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
}
}
-int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
- unsigned int irq_base,
- unsigned int nr_irqs, void *arg)
+static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base,
+ unsigned int nr_irqs, void *arg)
{
if (!domain->ops->alloc) {
pr_debug("domain->ops->alloc() is NULL\n");
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f300bb6be3bd..753eef8e041c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -35,6 +35,8 @@ static int __init setup_forced_irqthreads(char *arg)
early_param("threadirqs", setup_forced_irqthreads);
#endif
+static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);
+
static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
{
struct irq_data *irqd = irq_desc_get_irq_data(desc);
@@ -187,7 +189,7 @@ bool irq_can_set_affinity_usr(unsigned int irq)
* set_cpus_allowed_ptr() here as we hold desc->lock and this
* code can be called from hard interrupt context.
*/
-void irq_set_thread_affinity(struct irq_desc *desc)
+static void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action;
@@ -2789,8 +2791,7 @@ out:
irq_put_desc_unlock(desc, flags);
}
-int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
- bool *state)
+static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state)
{
struct irq_chip *chip;
int err = -EINVAL;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index eb150afd671f..147cabb4c077 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
return true;
}
+void irq_force_complete_move(struct irq_desc *desc)
+{
+ for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) {
+ if (d->chip && d->chip->irq_force_complete_move) {
+ d->chip->irq_force_complete_move(d);
+ return;
+ }
+ }
+}
+
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
@@ -117,3 +127,13 @@ void __irq_move_irq(struct irq_data *idata)
if (!masked)
idata->chip->irq_unmask(idata);
}
+
+bool irq_can_move_in_process_context(struct irq_data *data)
+{
+ /*
+ * Get the top level irq_data in the hierarchy, which is optimized
+ * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled.
+ */
+ data = irq_desc_get_irq_data(irq_data_to_desc(data));
+ return irq_can_move_pcntxt(data);
+}
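
irq_force_complete_move() above walks the irq_data hierarchy and stops at the first chip that provides an irq_force_complete_move() callback. A hypothetical irqchip wiring, assuming only the callback name shown in the hunk; everything else is illustrative:

	static void my_chip_force_complete_move(struct irq_data *d)
	{
		/* discard any interrupt-affinity move still pending for @d */
	}

	static struct irq_chip my_chip = {
		.name			 = "my-chip",
		.irq_force_complete_move = my_chip_force_complete_move,
	};
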
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 396a067a8a56..5c8d43cdb0a3 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -15,6 +15,7 @@
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/slab.h>
+#include <linux/seq_file.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/xarray.h>
@@ -756,12 +757,30 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw
return info->ops->msi_translate(domain, fwspec, hwirq, type);
}
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d,
+ struct irq_data *irqd, int ind)
+{
+ struct msi_desc *desc = irq_data_get_msi_desc(irqd);
+
+ if (!desc)
+ return;
+
+ seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi);
+ seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo);
+ seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data);
+}
+#endif
+
static const struct irq_domain_ops msi_domain_ops = {
.alloc = msi_domain_alloc,
.free = msi_domain_free,
.activate = msi_domain_activate,
.deactivate = msi_domain_deactivate,
.translate = msi_domain_translate,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = msi_domain_debug_show,
+#endif
};
static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
@@ -1143,7 +1162,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
return false;
- if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+ if (info->flags & MSI_FLAG_NO_MASK)
return false;
/*
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 93a822d3c468..7cb19e601426 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -653,13 +653,12 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
struct module *mod;
int ret;
- preempt_disable();
- mod = __module_text_address((unsigned long)start);
- WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
- if (!try_module_get(mod))
- mod = NULL;
- preempt_enable();
-
+ scoped_guard(rcu) {
+ mod = __module_text_address((unsigned long)start);
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ if (!try_module_get(mod))
+ mod = NULL;
+ }
if (!mod)
return 0;
@@ -746,9 +745,9 @@ static int jump_label_add_module(struct module *mod)
kfree(jlm);
return -ENOMEM;
}
- preempt_disable();
- jlm2->mod = __module_address((unsigned long)key);
- preempt_enable();
+ scoped_guard(rcu)
+ jlm2->mod = __module_address((unsigned long)key);
+
jlm2->entries = static_key_entries(key);
jlm2->next = NULL;
static_key_set_mod(key, jlm2);
@@ -906,13 +905,13 @@ static void jump_label_update(struct static_key *key)
return;
}
- preempt_disable();
- mod = __module_address((unsigned long)key);
- if (mod) {
- stop = mod->jump_entries + mod->num_jump_entries;
- init = mod->state == MODULE_STATE_COMING;
+ scoped_guard(rcu) {
+ mod = __module_address((unsigned long)key);
+ if (mod) {
+ stop = mod->jump_entries + mod->num_jump_entries;
+ init = mod->state == MODULE_STATE_COMING;
+ }
}
- preempt_enable();
#endif
entry = static_key_entries(key);
/* if there are no users, entry can be NULL */
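
The jump_label conversions above replace preempt_disable()/preempt_enable() pairs with RCU guards from <linux/cleanup.h>: guard(rcu)() holds an RCU read section until the end of the enclosing scope, while scoped_guard(rcu) { } bounds it to the braced block. A small helper in the same style, shown as a sketch only:

	static bool key_owner_is_coming(struct static_key *key)
	{
		bool coming = false;

		scoped_guard(rcu) {
			struct module *mod = __module_address((unsigned long)key);

			if (mod)
				coming = mod->state == MODULE_STATE_COMING;
		}
		return coming;
	}
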
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index a9a0ca605d4a..4198f30aac3c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -148,16 +148,8 @@ static unsigned int get_symbol_offset(unsigned long pos)
unsigned long kallsyms_sym_address(int idx)
{
- /* values are unsigned offsets if --absolute-percpu is not in effect */
- if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU))
- return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
-
- /* ...otherwise, positive offsets are absolute values */
- if (kallsyms_offsets[idx] >= 0)
- return kallsyms_offsets[idx];
-
- /* ...and negative offsets are relative to kallsyms_relative_base - 1 */
- return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
+ /* values are unsigned offsets */
+ return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
}
static unsigned int get_symbol_seq(int index)
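
With CONFIG_KALLSYMS_ABSOLUTE_PERCPU gone, every entry in kallsyms_offsets[] is interpreted as an unsigned offset. A worked example with made-up values:

	/*
	 * Illustrative numbers only:
	 *   kallsyms_relative_base = 0xffffffff81000000
	 *   kallsyms_offsets[idx]  = 0x00123456  (stored as s32, read back as u32)
	 *
	 *   kallsyms_sym_address(idx) = 0xffffffff81000000 + 0x00123456
	 *                             = 0xffffffff81123456
	 *
	 * No sign check or "relative_base - 1 - offset" fallback is needed anymore.
	 */
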
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 2c596851f8a9..7c1a65bd5f8d 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -145,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
*/
task1 = find_task_by_vpid(pid1);
task2 = find_task_by_vpid(pid2);
- if (!task1 || !task2)
+ if (unlikely(!task1 || !task2))
goto err_no_task;
get_task_struct(task1);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c0bdc1686154..c22ad51c4317 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1013,7 +1013,7 @@ int kernel_kexec(void)
error = -EBUSY;
goto Restore_console;
}
- suspend_console();
+ console_suspend_all();
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Resume_console;
@@ -1072,7 +1072,7 @@ int kernel_kexec(void)
Resume_devices:
dpm_resume_end(PMSG_RESTORE);
Resume_console:
- resume_console();
+ console_resume_all();
thaw_processes();
Restore_console:
pm_restore_console();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 88aeac84e4c0..ffe0c3d52306 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1547,7 +1547,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
/* Ensure the address is in a text area, and find a module if exists. */
*probed_mod = NULL;
if (!core_kernel_text((unsigned long) p->addr)) {
- guard(preempt)();
+ guard(rcu)();
*probed_mod = __module_text_address((unsigned long) p->addr);
if (!(*probed_mod))
return -EINVAL;
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 0cd39954d5a1..0e73fac55f8e 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -59,7 +59,7 @@ static void klp_find_object_module(struct klp_object *obj)
if (!klp_is_module(obj))
return;
- rcu_read_lock_sched();
+ guard(rcu)();
/*
* We do not want to block removal of patched modules and therefore
* we do not take a reference here. The patches are removed by
@@ -75,8 +75,6 @@ static void klp_find_object_module(struct klp_object *obj)
*/
if (mod && mod->klp_alive)
obj->mod = mod;
-
- rcu_read_unlock_sched();
}
static bool klp_initialized(void)
@@ -601,9 +599,12 @@ static int klp_add_object_nops(struct klp_patch *patch,
}
/*
- * Add 'nop' functions which simply return to the caller to run
- * the original function. The 'nop' functions are added to a
- * patch to facilitate a 'replace' mode.
+ * Add 'nop' functions which simply return to the caller to run the
+ * original function.
+ *
+ * They are added only when the atomic replace mode is used and only for
+ * functions which are currently livepatched but are no longer included
+ * in the new livepatch.
*/
static int klp_add_nops(struct klp_patch *patch)
{
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 0db4093d17b8..a114949eeed5 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,7 +5,8 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
-# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance.
+KASAN_SANITIZE_lockdep.o := n
KCSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index 97fb6f3f840a..4e36258cc34f 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -50,6 +50,11 @@ LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
#endif /* CONFIG_QUEUED_SPINLOCKS */
/*
+ * Locking events for Resilient Queued Spin Lock
+ */
+LOCK_EVENT(rqspinlock_lock_timeout)	/* # of locking ops that time out */
+
+/*
* Locking events for rwsem
*/
LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
@@ -67,3 +72,31 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
+
+/*
+ * Locking events for rtlock_slowlock()
+ */
+LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */
+LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */
+LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtlock_slow_wake)	/* # of wakeups */
+
+/*
+ * Locking events for rt_mutex_slowlock()
+ */
+LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */
+LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */
+LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */
+LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */
+LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtmutex_slow_wake)	/* # of wakeups */
+LOCK_EVENT(rtmutex_deadlock)	/* # of rt_mutex_handle_deadlock() calls */
+
+/*
+ * Locking events for lockdep
+ */
+LOCK_EVENT(lockdep_acquire)
+LOCK_EVENT(lockdep_lock)
+LOCK_EVENT(lockdep_nocheck)
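
Each LOCK_EVENT(name) entry expands into a per-CPU counter that the slow paths bump with lockevent_inc()/lockevent_cond_inc(), as the rtmutex.c hunks later in this diff do; with CONFIG_LOCK_EVENT_COUNTS enabled the totals are readable from debugfs, typically under <debugfs>/lockevent/. A hypothetical instrumented slow path, assuming "mylock_slowpath" and "mylock_contended" events were added to this list in the same way as the entries above:

	struct mylock { atomic_t waiters; };

	static void mylock_lock_slowpath(struct mylock *lock)
	{
		/* count every slow path entry */
		lockevent_inc(mylock_slowpath);
		/* count only the contended ones */
		lockevent_cond_inc(mylock_contended, atomic_read(&lock->waiters) > 0);
	}
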
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4470680f0226..58d78a33ac65 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -57,10 +57,12 @@
#include <linux/lockdep.h>
#include <linux/context_tracking.h>
#include <linux/console.h>
+#include <linux/kasan.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
+#include "lock_events.h"
#include <trace/events/lock.h>
@@ -170,6 +172,7 @@ static struct task_struct *lockdep_selftest_task_struct;
static int graph_lock(void)
{
lockdep_lock();
+ lockevent_inc(lockdep_lock);
/*
* Make sure that if another CPU detected a bug while
* walking the graph we dont change it (while the other
@@ -5091,8 +5094,12 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (unlikely(lock->key == &__lockdep_no_track__))
return 0;
- if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ lockevent_inc(lockdep_acquire);
+
+ if (!prove_locking || lock->key == &__lockdep_no_validate__) {
check = 0;
+ lockevent_inc(lockdep_nocheck);
+ }
if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass];
@@ -5824,6 +5831,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!debug_locks)
return;
+ /*
+ * As KASAN instrumentation is disabled and lock_acquire() is usually
+ * the first lockdep call when a task tries to acquire a lock, add
+ * kasan_check_byte() here to check for use-after-free and other
+ * memory errors.
+ */
+ kasan_check_byte(lock);
+
if (unlikely(!lockdep_enabled())) {
/* XXX allow trylock from NMI ?!? */
if (lockdep_nmi() && !trylock) {
@@ -6249,6 +6264,9 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
hlist_del_rcu(&class->hash_entry);
WRITE_ONCE(class->key, NULL);
WRITE_ONCE(class->name, NULL);
+ /* Class allocated but not used, -1 in nr_unused_locks */
+ if (class->usage_mask == 0)
+ debug_atomic_dec(nr_unused_locks);
nr_lock_classes--;
__clear_bit(class - lock_classes, lock_classes_in_use);
if (class - lock_classes == max_lock_class_idx)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index cc33470f4de9..ce0362f0a871 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -362,6 +362,60 @@ static struct lock_torture_ops raw_spin_lock_irq_ops = {
.name = "raw_spin_lock_irq"
};
+#ifdef CONFIG_BPF_SYSCALL
+
+#include <asm/rqspinlock.h>
+static rqspinlock_t rqspinlock;
+
+static int torture_raw_res_spin_write_lock(int tid __maybe_unused)
+{
+ raw_res_spin_lock(&rqspinlock);
+ return 0;
+}
+
+static void torture_raw_res_spin_write_unlock(int tid __maybe_unused)
+{
+ raw_res_spin_unlock(&rqspinlock);
+}
+
+static struct lock_torture_ops raw_res_spin_lock_ops = {
+ .writelock = torture_raw_res_spin_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_res_spin_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_res_spin_lock"
+};
+
+static int torture_raw_res_spin_write_lock_irq(int tid __maybe_unused)
+{
+ unsigned long flags;
+
+ raw_res_spin_lock_irqsave(&rqspinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_res_spin_write_unlock_irq(int tid __maybe_unused)
+{
+ raw_res_spin_unlock_irqrestore(&rqspinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_res_spin_lock_irq_ops = {
+ .writelock = torture_raw_res_spin_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_res_spin_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_res_spin_lock_irq"
+};
+
+#endif
+
static DEFINE_RWLOCK(torture_rwlock);
static int torture_rwlock_write_lock(int tid __maybe_unused)
@@ -1168,6 +1222,9 @@ static int __init lock_torture_init(void)
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
&raw_spin_lock_ops, &raw_spin_lock_irq_ops,
+#ifdef CONFIG_BPF_SYSCALL
+ &raw_res_spin_lock_ops, &raw_res_spin_lock_irq_ops,
+#endif
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 85251d8771d9..5c92ba199b90 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -15,12 +15,6 @@
#include <asm/mcs_spinlock.h>
-struct mcs_spinlock {
- struct mcs_spinlock *next;
- int locked; /* 1 if lock acquired */
- int count; /* nesting count, see qspinlock.c */
-};
-
#ifndef arch_mcs_spin_lock_contended
/*
* Using smp_cond_load_acquire() provides the acquire semantics
@@ -30,9 +24,7 @@ struct mcs_spinlock {
* spinning, and smp_cond_load_acquire() provides that behavior.
*/
#define arch_mcs_spin_lock_contended(l) \
-do { \
- smp_cond_load_acquire(l, VAL); \
-} while (0)
+ smp_cond_load_acquire(l, VAL)
#endif
#ifndef arch_mcs_spin_unlock_contended
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index b36f23de48f1..19b636f60a24 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -143,6 +143,8 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
unsigned long curr = (unsigned long)current;
unsigned long zero = 0UL;
+ MUTEX_WARN_ON(lock->magic != lock);
+
if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
return true;
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 7d96bed718e4..af8d122bb649 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -25,8 +25,9 @@
#include <trace/events/lock.h>
/*
- * Include queued spinlock statistics code
+ * Include queued spinlock definitions and statistics code
*/
+#include "qspinlock.h"
#include "qspinlock_stat.h"
/*
@@ -67,36 +68,6 @@
*/
#include "mcs_spinlock.h"
-#define MAX_NODES 4
-
-/*
- * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
- * size and four of them will fit nicely in one 64-byte cacheline. For
- * pvqspinlock, however, we need more space for extra data. To accommodate
- * that, we insert two more long words to pad it up to 32 bytes. IOW, only
- * two of them can fit in a cacheline in this case. That is OK as it is rare
- * to have more than 2 levels of slowpath nesting in actual use. We don't
- * want to penalize pvqspinlocks to optimize for a rare case in native
- * qspinlocks.
- */
-struct qnode {
- struct mcs_spinlock mcs;
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
- long reserved[2];
-#endif
-};
-
-/*
- * The pending bit spinning loop count.
- * This heuristic is used to limit the number of lockword accesses
- * made by atomic_cond_read_relaxed when waiting for the lock to
- * transition out of the "== _Q_PENDING_VAL" state. We don't spin
- * indefinitely because there's no guarantee that we'll make forward
- * progress.
- */
-#ifndef _Q_PENDING_LOOPS
-#define _Q_PENDING_LOOPS 1
-#endif
/*
* Per-CPU queue node structures; we can never have more than 4 nested
@@ -106,161 +77,7 @@ struct qnode {
*
* PV doubles the storage and uses the second cacheline for PV state.
*/
-static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
-
-/*
- * We must be able to distinguish between no-tail and the tail at 0:0,
- * therefore increment the cpu number by one.
- */
-
-static inline __pure u32 encode_tail(int cpu, int idx)
-{
- u32 tail;
-
- tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
- tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
-
- return tail;
-}
-
-static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
-{
- int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
- int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
-
- return per_cpu_ptr(&qnodes[idx].mcs, cpu);
-}
-
-static inline __pure
-struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
-{
- return &((struct qnode *)base + idx)->mcs;
-}
-
-#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
-
-#if _Q_PENDING_BITS == 8
-/**
- * clear_pending - clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,* -> *,0,*
- */
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
- WRITE_ONCE(lock->pending, 0);
-}
-
-/**
- * clear_pending_set_locked - take ownership and clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,0 -> *,0,1
- *
- * Lock stealing is not allowed if this function is used.
- */
-static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
-{
- WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
-}
-
-/*
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail), which heads an address dependency
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
- /*
- * We can use relaxed semantics since the caller ensures that the
- * MCS node is properly initialized before updating the tail.
- */
- return (u32)xchg_relaxed(&lock->tail,
- tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
-}
-
-#else /* _Q_PENDING_BITS == 8 */
-
-/**
- * clear_pending - clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,* -> *,0,*
- */
-static __always_inline void clear_pending(struct qspinlock *lock)
-{
- atomic_andnot(_Q_PENDING_VAL, &lock->val);
-}
-
-/**
- * clear_pending_set_locked - take ownership and clear the pending bit.
- * @lock: Pointer to queued spinlock structure
- *
- * *,1,0 -> *,0,1
- */
-static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
-{
- atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
-}
-
-/**
- * xchg_tail - Put in the new queue tail code word & retrieve previous one
- * @lock : Pointer to queued spinlock structure
- * @tail : The new queue tail code word
- * Return: The previous queue tail code word
- *
- * xchg(lock, tail)
- *
- * p,*,* -> n,*,* ; prev = xchg(lock, node)
- */
-static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
-{
- u32 old, new;
-
- old = atomic_read(&lock->val);
- do {
- new = (old & _Q_LOCKED_PENDING_MASK) | tail;
- /*
- * We can use relaxed semantics since the caller ensures that
- * the MCS node is properly initialized before updating the
- * tail.
- */
- } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
-
- return old;
-}
-#endif /* _Q_PENDING_BITS == 8 */
-
-/**
- * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
- * @lock : Pointer to queued spinlock structure
- * Return: The previous lock value
- *
- * *,*,* -> *,1,*
- */
-#ifndef queued_fetch_set_pending_acquire
-static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
-{
- return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
-}
-#endif
-
-/**
- * set_locked - Set the lock bit and own the lock
- * @lock: Pointer to queued spinlock structure
- *
- * *,*,0 -> *,0,1
- */
-static __always_inline void set_locked(struct qspinlock *lock)
-{
- WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
-}
-
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[_Q_MAX_NODES]);
/*
* Generate the native code for queued_spin_unlock_slowpath(); provide NOPs for
@@ -410,7 +227,7 @@ pv_queue:
* any MCS node. This is not the most elegant solution, but is
* simple enough.
*/
- if (unlikely(idx >= MAX_NODES)) {
+ if (unlikely(idx >= _Q_MAX_NODES)) {
lockevent_inc(lock_no_node);
while (!queued_spin_trylock(lock))
cpu_relax();
@@ -465,7 +282,7 @@ pv_queue:
* head of the waitqueue.
*/
if (old & _Q_TAIL_MASK) {
- prev = decode_tail(old);
+ prev = decode_tail(old, qnodes);
/* Link @node into the waitqueue. */
WRITE_ONCE(prev->next, node);
diff --git a/kernel/locking/qspinlock.h b/kernel/locking/qspinlock.h
new file mode 100644
index 000000000000..d69958a844f7
--- /dev/null
+++ b/kernel/locking/qspinlock.h
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Queued spinlock defines
+ *
+ * This file contains macro definitions and functions shared between different
+ * qspinlock slow path implementations.
+ */
+#ifndef __LINUX_QSPINLOCK_H
+#define __LINUX_QSPINLOCK_H
+
+#include <asm-generic/percpu.h>
+#include <linux/percpu-defs.h>
+#include <asm-generic/qspinlock.h>
+#include <asm-generic/mcs_spinlock.h>
+
+#define _Q_MAX_NODES 4
+
+/*
+ * The pending bit spinning loop count.
+ * This heuristic is used to limit the number of lockword accesses
+ * made by atomic_cond_read_relaxed when waiting for the lock to
+ * transition out of the "== _Q_PENDING_VAL" state. We don't spin
+ * indefinitely because there's no guarantee that we'll make forward
+ * progress.
+ */
+#ifndef _Q_PENDING_LOOPS
+#define _Q_PENDING_LOOPS 1
+#endif
+
+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. IOW, only
+ * two of them can fit in a cacheline in this case. That is OK as it is rare
+ * to have more than 2 levels of slowpath nesting in actual use. We don't
+ * want to penalize pvqspinlocks to optimize for a rare case in native
+ * qspinlocks.
+ */
+struct qnode {
+ struct mcs_spinlock mcs;
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+ long reserved[2];
+#endif
+};
+
+/*
+ * We must be able to distinguish between no-tail and the tail at 0:0,
+ * therefore increment the cpu number by one.
+ */
+
+static inline __pure u32 encode_tail(int cpu, int idx)
+{
+ u32 tail;
+
+ tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+ tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+ return tail;
+}
+
+static inline __pure struct mcs_spinlock *decode_tail(u32 tail,
+ struct qnode __percpu *qnodes)
+{
+ int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+ int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+ return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+
+static inline __pure
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+ return &((struct qnode *)base + idx)->mcs;
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+#if _Q_PENDING_BITS == 8
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->pending, 0);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail), which heads an address dependency
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ /*
+ * We can use relaxed semantics since the caller ensures that the
+ * MCS node is properly initialized before updating the tail.
+ */
+ return (u32)xchg_relaxed(&lock->tail,
+ tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+ atomic_andnot(_Q_PENDING_VAL, &lock->val);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+ atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+ u32 old, new;
+
+ old = atomic_read(&lock->val);
+ do {
+ new = (old & _Q_LOCKED_PENDING_MASK) | tail;
+ /*
+ * We can use relaxed semantics since the caller ensures that
+ * the MCS node is properly initialized before updating the
+ * tail.
+ */
+ } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
+
+ return old;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/**
+ * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
+ * @lock : Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+ return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+ WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
+}
+
+#endif /* __LINUX_QSPINLOCK_H */
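
A worked example of the tail code word handled above, assuming the common layout where a full pending byte exists (_Q_TAIL_IDX_OFFSET == 16, _Q_TAIL_CPU_OFFSET == 18); the numbers are illustrative:

	/*
	 *   encode_tail(cpu = 5, idx = 2)
	 *     = (5 + 1) << _Q_TAIL_CPU_OFFSET | 2 << _Q_TAIL_IDX_OFFSET
	 *     = 0x00180000                    | 0x00020000
	 *     = 0x001a0000
	 *
	 *   decode_tail(0x001a0000, qnodes) recovers cpu = 5, idx = 2 and returns
	 *   per_cpu_ptr(&qnodes[2].mcs, 5).
	 *
	 * The "+ 1" on the CPU number keeps an empty tail (0) distinguishable from
	 * a valid entry for CPU 0 at nesting index 0.
	 */
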
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 4a8df1800cbb..c80902eacd79 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -27,6 +27,7 @@
#include <trace/events/lock.h>
#include "rtmutex_common.h"
+#include "lock_events.h"
#ifndef WW_RT
# define build_ww_mutex() (false)
@@ -1612,10 +1613,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
struct task_struct *owner;
int ret = 0;
+ lockevent_inc(rtmutex_slow_block);
for (;;) {
/* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock, current, waiter))
+ if (try_to_take_rt_mutex(lock, current, waiter)) {
+ lockevent_inc(rtmutex_slow_acq3);
break;
+ }
if (timeout && !timeout->task) {
ret = -ETIMEDOUT;
@@ -1638,8 +1642,10 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
owner = NULL;
raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
- if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) {
+ lockevent_inc(rtmutex_slow_sleep);
rt_mutex_schedule();
+ }
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
@@ -1694,6 +1700,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
int ret;
lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtmutex_slowlock);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
@@ -1701,6 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
__ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
ww_mutex_lock_acquired(ww, ww_ctx);
}
+ lockevent_inc(rtmutex_slow_acq1);
return 0;
}
@@ -1719,10 +1727,12 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
__ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
ww_mutex_lock_acquired(ww, ww_ctx);
}
+ lockevent_inc(rtmutex_slow_acq2);
} else {
__set_current_state(TASK_RUNNING);
remove_waiter(lock, waiter);
rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
+ lockevent_inc(rtmutex_deadlock);
}
/*
@@ -1751,6 +1761,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
&waiter, wake_q);
debug_rt_mutex_free_waiter(&waiter);
+ lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q));
return ret;
}
@@ -1823,9 +1834,12 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
struct task_struct *owner;
lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtlock_slowlock);
- if (try_to_take_rt_mutex(lock, current, NULL))
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ lockevent_inc(rtlock_slow_acq1);
return;
+ }
rt_mutex_init_rtlock_waiter(&waiter);
@@ -1838,8 +1852,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
for (;;) {
/* Try to acquire the lock again */
- if (try_to_take_rt_mutex(lock, current, &waiter))
+ if (try_to_take_rt_mutex(lock, current, &waiter)) {
+ lockevent_inc(rtlock_slow_acq2);
break;
+ }
if (&waiter == rt_mutex_top_waiter(lock))
owner = rt_mutex_owner(lock);
@@ -1847,8 +1863,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
owner = NULL;
raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
- if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) {
+ lockevent_inc(rtlock_slow_sleep);
schedule_rtlock();
+ }
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(TASK_RTLOCK_WAIT);
@@ -1865,6 +1883,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
debug_rt_mutex_free_waiter(&waiter);
trace_contention_end(lock, 0);
+ lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q));
}
static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index d09b46ef032f..626cf8668a7e 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -124,17 +124,6 @@ char *module_next_tag_pair(char *string, unsigned long *secsize);
#define for_each_modinfo_entry(entry, info, name) \
for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry))
-static inline void module_assert_mutex_or_preempt(void)
-{
-#ifdef CONFIG_LOCKDEP
- if (unlikely(!debug_locks))
- return;
-
- WARN_ON_ONCE(!rcu_read_lock_sched_held() &&
- !lockdep_is_held(&module_mutex));
-#endif
-}
-
static inline unsigned long kernel_symbol_value(const struct kernel_symbol *sym)
{
#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index bf65e0c3c86f..00a60796327c 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -177,19 +177,15 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
unsigned long strtab_size;
void *data_base = mod->mem[MOD_DATA].base;
void *init_data_base = mod->mem[MOD_INIT_DATA].base;
+ struct mod_kallsyms *kallsyms;
- /* Set up to point into init section. */
- mod->kallsyms = (void __rcu *)init_data_base +
- info->mod_kallsyms_init_off;
+ kallsyms = init_data_base + info->mod_kallsyms_init_off;
- rcu_read_lock();
- /* The following is safe since this pointer cannot change */
- rcu_dereference(mod->kallsyms)->symtab = (void *)symsec->sh_addr;
- rcu_dereference(mod->kallsyms)->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ kallsyms->symtab = (void *)symsec->sh_addr;
+ kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
/* Make sure we get permanent strtab: don't use info->strtab. */
- rcu_dereference(mod->kallsyms)->strtab =
- (void *)info->sechdrs[info->index.str].sh_addr;
- rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs;
+ kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+ kallsyms->typetab = init_data_base + info->init_typeoffs;
/*
* Now populate the cut down core kallsyms for after init
@@ -199,20 +195,19 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
mod->core_kallsyms.strtab = s = data_base + info->stroffs;
mod->core_kallsyms.typetab = data_base + info->core_typeoffs;
strtab_size = info->core_typeoffs - info->stroffs;
- src = rcu_dereference(mod->kallsyms)->symtab;
- for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) {
- rcu_dereference(mod->kallsyms)->typetab[i] = elf_type(src + i, info);
+ src = kallsyms->symtab;
+ for (ndst = i = 0; i < kallsyms->num_symtab; i++) {
+ kallsyms->typetab[i] = elf_type(src + i, info);
if (i == 0 || is_livepatch_module(mod) ||
is_core_symbol(src + i, info->sechdrs, info->hdr->e_shnum,
info->index.pcpu)) {
ssize_t ret;
mod->core_kallsyms.typetab[ndst] =
- rcu_dereference(mod->kallsyms)->typetab[i];
+ kallsyms->typetab[i];
dst[ndst] = src[i];
dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
- ret = strscpy(s,
- &rcu_dereference(mod->kallsyms)->strtab[src[i].st_name],
+ ret = strscpy(s, &kallsyms->strtab[src[i].st_name],
strtab_size);
if (ret < 0)
break;
@@ -220,7 +215,9 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
strtab_size -= ret + 1;
}
}
- rcu_read_unlock();
+
+ /* Set up to point into init section. */
+ rcu_assign_pointer(mod->kallsyms, kallsyms);
mod->core_kallsyms.num_symtab = ndst;
}
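
The add_kallsyms() rework above is the classic RCU publish pattern: the init-section table is fully populated through a private pointer and only then made visible with rcu_assign_pointer(), so a reader doing rcu_dereference(mod->kallsyms) can never observe a half-initialized table. A generic sketch of the same shape; types and names are illustrative:

	struct table {
		int nr;
		const char *name;
	};

	static struct table __rcu *cur_table;

	static void publish_table(struct table *t, int nr, const char *name)
	{
		t->nr = nr;				/* fully initialize first ... */
		t->name = name;
		rcu_assign_pointer(cur_table, t);	/* ... then publish */
	}

	static int table_entries(void)
	{
		struct table *t;

		guard(rcu)();
		t = rcu_dereference(cur_table);
		return t ? t->nr : 0;
	}
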
@@ -260,7 +257,7 @@ static const char *find_kallsyms_symbol(struct module *mod,
{
unsigned int i, best = 0;
unsigned long nextval, bestval;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+ struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms);
struct module_memory *mod_mem;
/* At worse, next value is at end of module */
@@ -319,7 +316,7 @@ void * __weak dereference_module_function_descriptor(struct module *mod,
/*
* For kallsyms to ask for address resolution. NULL means not found. Careful
- * not to lock to avoid deadlock on oopses, simply disable preemption.
+ * not to lock to avoid deadlock on oopses; RCU is enough.
*/
int module_address_lookup(unsigned long addr,
unsigned long *size,
@@ -332,7 +329,7 @@ int module_address_lookup(unsigned long addr,
int ret = 0;
struct module *mod;
- preempt_disable();
+ guard(rcu)();
mod = __module_address(addr);
if (mod) {
if (modname)
@@ -350,8 +347,6 @@ int module_address_lookup(unsigned long addr,
if (sym)
ret = strscpy(namebuf, sym, KSYM_NAME_LEN);
}
- preempt_enable();
-
return ret;
}
@@ -359,7 +354,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
{
struct module *mod;
- preempt_disable();
+ guard(rcu)();
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
@@ -371,12 +366,10 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
goto out;
strscpy(symname, sym, KSYM_NAME_LEN);
- preempt_enable();
return 0;
}
}
out:
- preempt_enable();
return -ERANGE;
}
@@ -385,13 +378,13 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
{
struct module *mod;
- preempt_disable();
+ guard(rcu)();
list_for_each_entry_rcu(mod, &modules, list) {
struct mod_kallsyms *kallsyms;
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- kallsyms = rcu_dereference_sched(mod->kallsyms);
+ kallsyms = rcu_dereference(mod->kallsyms);
if (symnum < kallsyms->num_symtab) {
const Elf_Sym *sym = &kallsyms->symtab[symnum];
@@ -400,12 +393,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
strscpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN);
strscpy(module_name, mod->name, MODULE_NAME_LEN);
*exported = is_exported(name, *value, mod);
- preempt_enable();
return 0;
}
symnum -= kallsyms->num_symtab;
}
- preempt_enable();
return -ERANGE;
}
@@ -413,7 +404,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
{
unsigned int i;
- struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+ struct mod_kallsyms *kallsyms = rcu_dereference(mod->kallsyms);
for (i = 0; i < kallsyms->num_symtab; i++) {
const Elf_Sym *sym = &kallsyms->symtab[i];
@@ -453,23 +444,15 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
/* Look for this name: can be of form module:name. */
unsigned long module_kallsyms_lookup_name(const char *name)
{
- unsigned long ret;
-
/* Don't lock: we're in enough trouble already. */
- preempt_disable();
- ret = __module_kallsyms_lookup_name(name);
- preempt_enable();
- return ret;
+ guard(rcu)();
+ return __module_kallsyms_lookup_name(name);
}
unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
{
- unsigned long ret;
-
- preempt_disable();
- ret = __find_kallsyms_symbol_value(mod, name);
- preempt_enable();
- return ret;
+ guard(rcu)();
+ return __find_kallsyms_symbol_value(mod, name);
}
int module_kallsyms_on_each_symbol(const char *modname,
@@ -490,10 +473,8 @@ int module_kallsyms_on_each_symbol(const char *modname,
if (modname && strcmp(modname, mod->name))
continue;
- /* Use rcu_dereference_sched() to remain compliant with the sparse tool */
- preempt_disable();
- kallsyms = rcu_dereference_sched(mod->kallsyms);
- preempt_enable();
+ kallsyms = rcu_dereference_check(mod->kallsyms,
+ lockdep_is_held(&module_mutex));
for (i = 0; i < kallsyms->num_symtab; i++) {
const Elf_Sym *sym = &kallsyms->symtab[i];
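
module_kallsyms_on_each_symbol() runs with module_mutex held rather than inside an RCU read section, which is why the hunk above uses rcu_dereference_check() with lockdep_is_held(&module_mutex) instead of a plain rcu_dereference(). The general shape, on a made-up pointer:

	struct foo {
		int value;
	};

	static struct foo __rcu *foo_ptr;
	static DEFINE_MUTEX(foo_mutex);

	/* Legal either under rcu_read_lock() or with foo_mutex held. */
	static struct foo *foo_get(void)
	{
		return rcu_dereference_check(foo_ptr, lockdep_is_held(&foo_mutex));
	}
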
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 1fb9ad289a6f..a2859dc3eea6 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -67,7 +67,7 @@
/*
* Mutex protects:
- * 1) List of modules (also safely readable with preempt_disable),
+ * 1) List of modules (also safely readable within RCU read section),
* 2) module_use links,
* 3) mod_tree.addr_min/mod_tree.addr_max.
* (delete and add uses RCU list operations).
@@ -331,7 +331,7 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms,
/*
* Find an exported symbol and return it, along with, (optional) crc and
- * (optional) module which owns it. Needs preempt disabled or module_mutex.
+ * (optional) module which owns it. Needs RCU or module_mutex.
*/
bool find_symbol(struct find_symbol_arg *fsa)
{
@@ -345,8 +345,6 @@ bool find_symbol(struct find_symbol_arg *fsa)
struct module *mod;
unsigned int i;
- module_assert_mutex_or_preempt();
-
for (i = 0; i < ARRAY_SIZE(arr); i++)
if (find_exported_symbol_in_section(&arr[i], NULL, fsa))
return true;
@@ -374,16 +372,14 @@ bool find_symbol(struct find_symbol_arg *fsa)
}
/*
- * Search for module by name: must hold module_mutex (or preempt disabled
- * for read-only access).
+ * Search for module by name: must hold module_mutex (or RCU for read-only
+ * access).
*/
struct module *find_module_all(const char *name, size_t len,
bool even_unformed)
{
struct module *mod;
- module_assert_mutex_or_preempt();
-
list_for_each_entry_rcu(mod, &modules, list,
lockdep_is_held(&module_mutex)) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
@@ -454,8 +450,7 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
struct module *mod;
unsigned int cpu;
- preempt_disable();
-
+ guard(rcu)();
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
@@ -472,13 +467,10 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
per_cpu_ptr(mod->percpu,
get_boot_cpu_id());
}
- preempt_enable();
return true;
}
}
}
-
- preempt_enable();
return false;
}
@@ -795,8 +787,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
async_synchronize_full();
/* Store the name and taints of the last unloaded module for diagnostic purposes */
- strscpy(last_unloaded_module.name, mod->name, sizeof(last_unloaded_module.name));
- strscpy(last_unloaded_module.taints, module_flags(mod, buf, false), sizeof(last_unloaded_module.taints));
+ strscpy(last_unloaded_module.name, mod->name);
+ strscpy(last_unloaded_module.taints, module_flags(mod, buf, false));
free_module(mod);
/* someone could wait for the module in add_unformed_module() */
@@ -814,10 +806,9 @@ void __symbol_put(const char *symbol)
.gplok = true,
};
- preempt_disable();
+ guard(rcu)();
BUG_ON(!find_symbol(&fsa));
module_put(fsa.owner);
- preempt_enable();
}
EXPORT_SYMBOL(__symbol_put);
@@ -832,13 +823,12 @@ void symbol_put_addr(void *addr)
/*
* Even though we hold a reference on the module; we still need to
- * disable preemption in order to safely traverse the data structure.
+ * RCU read section in order to safely traverse the data structure.
*/
- preempt_disable();
+ guard(rcu)();
modaddr = __module_text_address(a);
BUG_ON(!modaddr);
module_put(modaddr);
- preempt_enable();
}
EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -1189,7 +1179,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
getname:
/* We must make copy under the lock if we failed to get ref. */
- strncpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
+ strscpy(ownername, module_name(fsa.owner), MODULE_NAME_LEN);
unlock:
mutex_unlock(&module_mutex);
return fsa.sym;
@@ -1221,18 +1211,6 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
-void *__module_writable_address(struct module *mod, void *loc)
-{
- for_class_mod_mem_type(type, text) {
- struct module_memory *mem = &mod->mem[type];
-
- if (loc >= mem->base && loc < mem->base + mem->size)
- return loc + (mem->rw_copy - mem->base);
- }
-
- return loc;
-}
-
static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
{
unsigned int size = PAGE_ALIGN(mod->mem[type].size);
@@ -1250,21 +1228,15 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
if (!ptr)
return -ENOMEM;
- mod->mem[type].base = ptr;
-
if (execmem_is_rox(execmem_type)) {
- ptr = vzalloc(size);
+ int err = execmem_make_temp_rw(ptr, size);
- if (!ptr) {
- execmem_free(mod->mem[type].base);
+ if (err) {
+ execmem_free(ptr);
return -ENOMEM;
}
- mod->mem[type].rw_copy = ptr;
mod->mem[type].is_rox = true;
- } else {
- mod->mem[type].rw_copy = mod->mem[type].base;
- memset(mod->mem[type].base, 0, size);
}
/*
@@ -1278,18 +1250,29 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
* *do* eventually get freed, but let's just keep things simple
* and avoid *any* false positives.
*/
- kmemleak_not_leak(ptr);
+ if (!mod->mem[type].is_rox)
+ kmemleak_not_leak(ptr);
+
+ memset(ptr, 0, size);
+ mod->mem[type].base = ptr;
return 0;
}
+static void module_memory_restore_rox(struct module *mod)
+{
+ for_class_mod_mem_type(type, text) {
+ struct module_memory *mem = &mod->mem[type];
+
+ if (mem->is_rox)
+ execmem_restore_rox(mem->base, mem->size);
+ }
+}
+
static void module_memory_free(struct module *mod, enum mod_mem_type type)
{
struct module_memory *mem = &mod->mem[type];
- if (mem->is_rox)
- vfree(mem->rw_copy);
-
execmem_free(mem->base);
}
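
With the rw_copy buffer removed, module text that execmem hands out as ROX is made temporarily writable in place with execmem_make_temp_rw() while sections are copied and relocated, and sealed again with execmem_restore_rox() afterwards or on error paths (see module_memory_restore_rox()). A rough lifecycle sketch pieced together from the hunks above; error handling and the exact call sites are simplified:

	static void *alloc_module_text(size_t size)
	{
		void *text = execmem_alloc(EXECMEM_MODULE_TEXT, size);

		if (!text)
			return NULL;

		if (execmem_is_rox(EXECMEM_MODULE_TEXT)) {
			if (execmem_make_temp_rw(text, size)) {	/* open a write window */
				execmem_free(text);
				return NULL;
			}
			/*
			 * Sections are then copied and relocated directly into @text;
			 * execmem_restore_rox(text, size) seals it back to ROX once the
			 * contents are final (or if loading fails).
			 */
		}
		return text;
	}
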
@@ -1348,7 +1331,7 @@ static void free_module(struct module *mod)
mod_tree_remove(mod);
/* Remove this module from bug list, this uses list_del_rcu */
module_bug_cleanup(mod);
- /* Wait for RCU-sched synchronizing before releasing mod->list and buglist. */
+ /* Wait for RCU synchronizing before releasing mod->list and buglist. */
synchronize_rcu();
if (try_add_tainted_module(mod))
pr_err("%s: adding tainted module to the unloaded tainted modules list failed.\n",
@@ -1371,21 +1354,18 @@ void *__symbol_get(const char *symbol)
.warn = true,
};
- preempt_disable();
- if (!find_symbol(&fsa))
- goto fail;
- if (fsa.license != GPL_ONLY) {
- pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
- symbol);
- goto fail;
+ scoped_guard(rcu) {
+ if (!find_symbol(&fsa))
+ return NULL;
+ if (fsa.license != GPL_ONLY) {
+ pr_warn("failing symbol_get of non-GPLONLY symbol %s.\n",
+ symbol);
+ return NULL;
+ }
+ if (strong_try_module_get(fsa.owner))
+ return NULL;
}
- if (strong_try_module_get(fsa.owner))
- goto fail;
- preempt_enable();
return (void *)kernel_symbol_value(fsa.sym);
-fail:
- preempt_enable();
- return NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);
@@ -2642,7 +2622,6 @@ static int move_module(struct module *mod, struct load_info *info)
for_each_mod_mem_type(type) {
if (!mod->mem[type].size) {
mod->mem[type].base = NULL;
- mod->mem[type].rw_copy = NULL;
continue;
}
@@ -2659,7 +2638,6 @@ static int move_module(struct module *mod, struct load_info *info)
void *dest;
Elf_Shdr *shdr = &info->sechdrs[i];
const char *sname;
- unsigned long addr;
if (!(shdr->sh_flags & SHF_ALLOC))
continue;
@@ -2680,14 +2658,12 @@ static int move_module(struct module *mod, struct load_info *info)
ret = PTR_ERR(dest);
goto out_err;
}
- addr = (unsigned long)dest;
codetag_section_found = true;
} else {
enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
- addr = (unsigned long)mod->mem[type].base + offset;
- dest = mod->mem[type].rw_copy + offset;
+ dest = mod->mem[type].base + offset;
}
if (shdr->sh_type != SHT_NOBITS) {
@@ -2710,13 +2686,14 @@ static int move_module(struct module *mod, struct load_info *info)
* users of info can keep taking advantage and using the newly
* minted official memory area.
*/
- shdr->sh_addr = addr;
+ shdr->sh_addr = (unsigned long)dest;
pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
(long)shdr->sh_size, info->secstrings + shdr->sh_name);
}
return 0;
out_err:
+ module_memory_restore_rox(mod);
for (t--; t >= 0; t--)
module_memory_free(mod, t);
if (codetag_section_found)
@@ -2863,17 +2840,8 @@ int __weak module_finalize(const Elf_Ehdr *hdr,
return 0;
}
-int __weak module_post_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- return 0;
-}
-
static int post_relocation(struct module *mod, const struct load_info *info)
{
- int ret;
-
/* Sort exception table now relocations are done. */
sort_extable(mod->extable, mod->extable + mod->num_exentries);
@@ -2885,24 +2853,7 @@ static int post_relocation(struct module *mod, const struct load_info *info)
add_kallsyms(mod, info);
/* Arch-specific module finalizing. */
- ret = module_finalize(info->hdr, info->sechdrs, mod);
- if (ret)
- return ret;
-
- for_each_mod_mem_type(type) {
- struct module_memory *mem = &mod->mem[type];
-
- if (mem->is_rox) {
- if (!execmem_update_copy(mem->base, mem->rw_copy,
- mem->size))
- return -ENOMEM;
-
- vfree(mem->rw_copy);
- mem->rw_copy = NULL;
- }
- }
-
- return module_post_finalize(info->hdr, info->sechdrs, mod);
+ return module_finalize(info->hdr, info->sechdrs, mod);
}
/* Call module constructors. */
@@ -3049,7 +3000,7 @@ static noinline int do_init_module(struct module *mod)
#endif
/*
* We want to free module_init, but be aware that kallsyms may be
- * walking this with preempt disabled. In all the failure paths, we
+ * walking this within an RCU read section. In all the failure paths, we
* call synchronize_rcu(), but we don't want to slow down the success
* path. execmem_free() cannot be called in an interrupt, so do the
* work and call synchronize_rcu() in a work queue.
@@ -3499,6 +3450,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod->mem[type].size);
}
+ module_memory_restore_rox(mod);
module_deallocate(mod, info);
free_copy:
/*
@@ -3715,28 +3667,23 @@ out:
/* Given an address, look for it in the module exception tables. */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
- const struct exception_table_entry *e = NULL;
struct module *mod;
- preempt_disable();
+ guard(rcu)();
mod = __module_address(addr);
if (!mod)
- goto out;
+ return NULL;
if (!mod->num_exentries)
- goto out;
-
- e = search_extable(mod->extable,
- mod->num_exentries,
- addr);
-out:
- preempt_enable();
-
+ return NULL;
/*
- * Now, if we found one, we are running inside it now, hence
- * we cannot unload the module, hence no refcnt needed.
+	 * The address passed here belongs to a module that is currently
+	 * executing (we are running inside it). Its module::refcnt must
+	 * therefore already be >0, ensuring that it is not removed at this
+	 * stage. All other users need to invoke this function within an RCU
+	 * read section.
*/
- return e;
+ return search_extable(mod->extable, mod->num_exentries, addr);
}
/**
@@ -3748,20 +3695,15 @@ out:
*/
bool is_module_address(unsigned long addr)
{
- bool ret;
-
- preempt_disable();
- ret = __module_address(addr) != NULL;
- preempt_enable();
-
- return ret;
+ guard(rcu)();
+ return __module_address(addr) != NULL;
}
/**
* __module_address() - get the module which contains an address.
* @addr: the address.
*
- * Must be called with preempt disabled or module mutex held so that
+ * Must be called within RCU read section or module mutex held so that
* module doesn't get freed during this.
*/
struct module *__module_address(unsigned long addr)
@@ -3779,8 +3721,6 @@ struct module *__module_address(unsigned long addr)
return NULL;
lookup:
- module_assert_mutex_or_preempt();
-
mod = mod_find(addr, &mod_tree);
if (mod) {
BUG_ON(!within_module(addr, mod));
@@ -3800,20 +3740,28 @@ lookup:
*/
bool is_module_text_address(unsigned long addr)
{
- bool ret;
+ guard(rcu)();
+ return __module_text_address(addr) != NULL;
+}
- preempt_disable();
- ret = __module_text_address(addr) != NULL;
- preempt_enable();
+void module_for_each_mod(int(*func)(struct module *mod, void *data), void *data)
+{
+ struct module *mod;
- return ret;
+ guard(rcu)();
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+ if (func(mod, data))
+ break;
+ }
}
/**
* __module_text_address() - get the module whose code contains an address.
* @addr: the address.
*
- * Must be called with preempt disabled or module mutex held so that
+ * Must be called within RCU read section or module mutex held so that
* module doesn't get freed during this.
*/
struct module *__module_text_address(unsigned long addr)
@@ -3836,7 +3784,7 @@ void print_modules(void)
printk(KERN_DEFAULT "Modules linked in:");
/* Most callers should already have preempt disabled, but make sure */
- preempt_disable();
+ guard(rcu)();
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
@@ -3844,7 +3792,6 @@ void print_modules(void)
}
print_unloaded_tainted_modules();
- preempt_enable();
if (last_unloaded_module.name[0])
pr_cont(" [last unloaded: %s%s]", last_unloaded_module.name,
last_unloaded_module.taints);
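
module_for_each_mod(), added above, walks the live module list under an RCU guard, skips MODULE_STATE_UNFORMED entries, and stops as soon as the callback returns non-zero. A hedged sketch of a caller, assuming the declaration is visible through the module header as in this series; the callback and counter are purely illustrative:

#include <linux/module.h>

/* Callback: returning non-zero would stop the walk early. */
static int count_formed(struct module *mod, void *data)
{
	(*(int *)data)++;
	return 0;
}

static int nr_formed_modules(void)
{
	int n = 0;

	/* Iterates the module list inside an RCU read section. */
	module_for_each_mod(count_formed, &n);
	return n;
}
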
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 74834ba15615..03f4142cfbf4 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
+#include <linux/execmem.h>
#include "internal.h"
static int module_set_memory(const struct module *mod, enum mod_mem_type type,
@@ -32,12 +33,12 @@ static int module_set_memory(const struct module *mod, enum mod_mem_type type,
int module_enable_text_rox(const struct module *mod)
{
for_class_mod_mem_type(type, text) {
+ const struct module_memory *mem = &mod->mem[type];
int ret;
- if (mod->mem[type].is_rox)
- continue;
-
- if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ if (mem->is_rox)
+ ret = execmem_restore_rox(mem->base, mem->size);
+ else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
ret = module_set_memory(mod, type, set_memory_rox);
else
ret = module_set_memory(mod, type, set_memory_x);
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
index 16742d1c630c..4fefec5b683c 100644
--- a/kernel/module/tracking.c
+++ b/kernel/module/tracking.c
@@ -21,8 +21,6 @@ int try_add_tainted_module(struct module *mod)
{
struct mod_unload_taint *mod_taint;
- module_assert_mutex_or_preempt();
-
if (!mod->taints)
goto out;
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
index 277197977d43..d3204c5c74eb 100644
--- a/kernel/module/tree_lookup.c
+++ b/kernel/module/tree_lookup.c
@@ -12,11 +12,11 @@
/*
* Use a latched RB-tree for __module_address(); this allows us to use
- * RCU-sched lookups of the address from any context.
+ * RCU lookups of the address from any context.
*
- * This is conditional on PERF_EVENTS || TRACING because those can really hit
- * __module_address() hard by doing a lot of stack unwinding; potentially from
- * NMI context.
+ * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can
+ * really hit __module_address() hard by doing a lot of stack unwinding;
+ * potentially from NMI context.
*/
static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
diff --git a/kernel/module/version.c b/kernel/module/version.c
index 3718a8868321..2beefeba82d9 100644
--- a/kernel/module/version.c
+++ b/kernel/module/version.c
@@ -79,17 +79,17 @@ int check_modstruct_version(const struct load_info *info,
.name = "module_layout",
.gplok = true,
};
+ bool have_symbol;
/*
* Since this should be found in kernel (which can't be removed), no
- * locking is necessary -- use preempt_disable() to placate lockdep.
+	 * locking is necessary. Regardless, use an RCU read section to keep
+	 * lockdep happy.
*/
- preempt_disable();
- if (!find_symbol(&fsa)) {
- preempt_enable();
- BUG();
- }
- preempt_enable();
+ scoped_guard(rcu)
+ have_symbol = find_symbol(&fsa);
+ BUG_ON(!have_symbol);
+
return check_version(info, "module_layout", mod, fsa.crc);
}
diff --git a/kernel/padata.c b/kernel/padata.c
index 418987056340..b3d4eacc4f5d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -290,7 +290,7 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
if (remove_object) {
list_del_init(&padata->list);
++pd->processed;
- pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
+ pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
}
spin_unlock(&reorder->lock);
diff --git a/kernel/panic.c b/kernel/panic.c
index d8635d5cecb2..0c55eec9e874 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -511,6 +511,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
TAINT_FLAG(AUX, 'X', ' ', true),
TAINT_FLAG(RANDSTRUCT, 'T', ' ', true),
TAINT_FLAG(TEST, 'N', ' ', true),
+ TAINT_FLAG(FWCTL, 'J', ' ', true),
};
#undef TAINT_FLAG
diff --git a/kernel/params.c b/kernel/params.c
index 0074d29c9b80..2509f216c9f3 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -551,7 +551,7 @@ struct module_param_attrs
{
unsigned int num;
struct attribute_group grp;
- struct param_attribute attrs[];
+ struct param_attribute attrs[] __counted_by(num);
};
#ifdef CONFIG_SYSFS
@@ -651,35 +651,32 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
}
/* Enlarge allocations. */
- new_mp = krealloc(mk->mp,
- sizeof(*mk->mp) +
- sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1),
+ new_mp = krealloc(mk->mp, struct_size(mk->mp, attrs, mk->mp->num + 1),
GFP_KERNEL);
if (!new_mp)
return -ENOMEM;
mk->mp = new_mp;
+ mk->mp->num++;
/* Extra pointer for NULL terminator */
- new_attrs = krealloc(mk->mp->grp.attrs,
- sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2),
- GFP_KERNEL);
+ new_attrs = krealloc_array(mk->mp->grp.attrs, mk->mp->num + 1,
+ sizeof(mk->mp->grp.attrs[0]), GFP_KERNEL);
if (!new_attrs)
return -ENOMEM;
mk->mp->grp.attrs = new_attrs;
/* Tack new one on the end. */
- memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0]));
- sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr);
- mk->mp->attrs[mk->mp->num].param = kp;
- mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show;
+ memset(&mk->mp->attrs[mk->mp->num - 1], 0, sizeof(mk->mp->attrs[0]));
+ sysfs_attr_init(&mk->mp->attrs[mk->mp->num - 1].mattr.attr);
+ mk->mp->attrs[mk->mp->num - 1].param = kp;
+ mk->mp->attrs[mk->mp->num - 1].mattr.show = param_attr_show;
/* Do not allow runtime DAC changes to make param writable. */
if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
- mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store;
+ mk->mp->attrs[mk->mp->num - 1].mattr.store = param_attr_store;
else
- mk->mp->attrs[mk->mp->num].mattr.store = NULL;
- mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name;
- mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm;
- mk->mp->num++;
+ mk->mp->attrs[mk->mp->num - 1].mattr.store = NULL;
+ mk->mp->attrs[mk->mp->num - 1].mattr.attr.name = (char *)name;
+ mk->mp->attrs[mk->mp->num - 1].mattr.attr.mode = kp->perm;
/* Fix up all the pointers, since krealloc can move us */
for (i = 0; i < mk->mp->num; i++)
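
The add_sysfs_param() hunk above pairs the new __counted_by(num) annotation with struct_size()/krealloc_array() and bumps num before the new slot is written, so the fortified bound already covers the element being initialized. A standalone sketch of the same growth pattern; the struct and helper here are made up for illustration and assume the bag was allocated earlier (e.g. with struct_size(b, items, 0)):

#include <linux/overflow.h>
#include <linux/slab.h>

struct item {
	int val;
};

struct bag {
	unsigned int num;
	struct item items[] __counted_by(num);
};

/* Grow an existing, non-NULL bag by one element. */
static struct bag *bag_push(struct bag *b, int val)
{
	struct bag *nb;

	nb = krealloc(b, struct_size(b, items, b->num + 1), GFP_KERNEL);
	if (!nb)
		return NULL;

	nb->num++;				/* make the new slot "in bounds" first */
	nb->items[nb->num - 1].val = val;	/* ...then initialize it */
	return nb;
}
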
diff --git a/kernel/pid.c b/kernel/pid.c
index 924084713be8..4ac2ce46817f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = {
};
EXPORT_SYMBOL_GPL(init_pid_ns);
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
@@ -128,11 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
- /* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
- unsigned long flags;
- spin_lock_irqsave(&pidmap_lock, flags);
+ lockdep_assert_not_held(&tasklist_lock);
+
+ spin_lock(&pidmap_lock);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
@@ -155,11 +141,23 @@ void free_pid(struct pid *pid)
idr_remove(&ns->idr, upid->nr);
}
pidfs_remove_pid(pid);
- spin_unlock_irqrestore(&pidmap_lock, flags);
+ spin_unlock(&pidmap_lock);
call_rcu(&pid->rcu, delayed_put_pid);
}
+void free_pids(struct pid **pids)
+{
+ int tmp;
+
+ /*
+ * This can batch pidmap_lock.
+ */
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (pids[tmp])
+ free_pid(pids[tmp]);
+}
+
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{
@@ -211,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
}
idr_preload(GFP_KERNEL);
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -238,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
if (nr < 0) {
@@ -272,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
upid = pid->numbers + ns->level;
idr_preload(GFP_KERNEL);
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
pidfs_add_pid(pid);
@@ -281,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
return pid;
out_unlock:
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
put_pid_ns(ns);
out_free:
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
@@ -302,7 +300,7 @@ out_free:
if (ns->pid_allocated == PIDNS_ADDING)
idr_set_cursor(&ns->idr, 0);
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -310,9 +308,9 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
ns->pid_allocated &= ~PIDNS_ADDING;
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
@@ -339,17 +337,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
*/
void attach_pid(struct task_struct *task, enum pid_type type)
{
- struct pid *pid = *task_pid_ptr(task, type);
+ struct pid *pid;
+
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid = *task_pid_ptr(task, type);
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
-static void __change_pid(struct task_struct *task, enum pid_type type,
- struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+ enum pid_type type, struct pid *new)
{
- struct pid **pid_ptr = task_pid_ptr(task, type);
- struct pid *pid;
+ struct pid **pid_ptr, *pid;
int tmp;
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid_ptr = task_pid_ptr(task, type);
pid = *pid_ptr;
hlist_del_rcu(&task->pid_links[type]);
@@ -364,18 +368,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
if (pid_has_task(pid, tmp))
return;
- free_pid(pid);
+ WARN_ON(pids[type]);
+ pids[type] = pid;
}
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
{
- __change_pid(task, type, NULL);
+ __change_pid(pids, task, type, NULL);
}
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
struct pid *pid)
{
- __change_pid(task, type, pid);
+ __change_pid(pids, task, type, pid);
attach_pid(task, type);
}
@@ -386,6 +391,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+ lockdep_assert_held_write(&tasklist_lock);
+
/* Swap the single entry tid lists */
hlists_swap_heads_rcu(head1, head2);
@@ -403,6 +410,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
WARN_ON_ONCE(type == PIDTYPE_PID);
+ lockdep_assert_held_write(&tasklist_lock);
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
@@ -564,15 +572,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
*/
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
- unsigned int f_flags;
+ unsigned int f_flags = 0;
struct pid *pid;
struct task_struct *task;
+ enum pid_type type;
- pid = pidfd_get_pid(pidfd, &f_flags);
- if (IS_ERR(pid))
- return ERR_CAST(pid);
+ switch (pidfd) {
+ case PIDFD_SELF_THREAD:
+ type = PIDTYPE_PID;
+ pid = get_task_pid(current, type);
+ break;
+ case PIDFD_SELF_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ pid = get_task_pid(current, type);
+ break;
+ default:
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+ type = PIDTYPE_TGID;
+ break;
+ }
- task = get_pid_task(pid, PIDTYPE_TGID);
+ task = get_pid_task(pid, type);
put_pid(pid);
if (!task)
return ERR_PTR(-ESRCH);
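
With the changes above, detach_pid()/__change_pid() no longer free a struct pid directly; a pid that became unused is stashed in a caller-provided array, and free_pids() runs after the tasklist_lock critical section, matching the new lockdep_assert_not_held() in free_pid() and the switch to a plain spin_lock() on pidmap_lock. A hedged sketch of the intended caller pattern; the wrapper function is illustrative only and greatly simplified compared to the real exit paths:

#include <linux/pid.h>
#include <linux/sched.h>
#include <linux/sched/task.h>

/* Collect unused pids while tasklist_lock is write-locked, free them after. */
static void detach_and_free(struct task_struct *p)
{
	struct pid *pids[PIDTYPE_MAX] = { NULL };

	write_lock_irq(&tasklist_lock);
	detach_pid(pids, p, PIDTYPE_PID);	/* stashes the pid if now unused */
	write_unlock_irq(&tasklist_lock);

	free_pids(pids);			/* must run without tasklist_lock */
}
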
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ca947ed32e3d..54a623680019 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -380,8 +380,7 @@ config CPU_PM
config ENERGY_MODEL
bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)"
- depends on SMP
- depends on CPU_FREQ
+ depends on CPU_FREQ || PM_DEVFREQ
help
Several subsystems (thermal and/or the task scheduler for example)
can leverage information about the energy consumed by devices to
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 3874f0e97651..d9b7e2b38c7a 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -161,22 +161,10 @@ static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif
-static void em_destroy_table_rcu(struct rcu_head *rp)
-{
- struct em_perf_table __rcu *table;
-
- table = container_of(rp, struct em_perf_table, rcu);
- kfree(table);
-}
-
static void em_release_table_kref(struct kref *kref)
{
- struct em_perf_table __rcu *table;
-
/* It was the last owner of this table so we can free */
- table = container_of(kref, struct em_perf_table, kref);
-
- call_rcu(&table->rcu, em_destroy_table_rcu);
+ kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu);
}
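
The em_release_table_kref() change collapses the call_rcu() + container_of() callback into kfree_rcu(), which frees the object after a grace period via the embedded rcu_head. A minimal standalone sketch of the same conversion; the struct is illustrative:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct blob {
	int payload;
	struct rcu_head rcu;
};

/* Frees *b after a grace period; equivalent to call_rcu() with a callback
 * that does kfree(container_of(rhp, struct blob, rcu)). */
static void blob_free(struct blob *b)
{
	kfree_rcu(b, rcu);
}
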
/**
@@ -185,7 +173,7 @@ static void em_release_table_kref(struct kref *kref)
*
* No return values.
*/
-void em_table_free(struct em_perf_table __rcu *table)
+void em_table_free(struct em_perf_table *table)
{
kref_put(&table->kref, em_release_table_kref);
}
@@ -198,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table)
* has a user.
* Returns allocated table or NULL.
*/
-struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd)
+struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
{
- struct em_perf_table __rcu *table;
+ struct em_perf_table *table;
int table_size;
table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
@@ -239,7 +227,7 @@ static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
}
static int em_compute_costs(struct device *dev, struct em_perf_state *table,
- struct em_data_callback *cb, int nr_states,
+ const struct em_data_callback *cb, int nr_states,
unsigned long flags)
{
unsigned long prev_cost = ULONG_MAX;
@@ -308,9 +296,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
* Return 0 on success or an error code on failure.
*/
int em_dev_update_perf_domain(struct device *dev,
- struct em_perf_table __rcu *new_table)
+ struct em_perf_table *new_table)
{
- struct em_perf_table __rcu *old_table;
+ struct em_perf_table *old_table;
struct em_perf_domain *pd;
if (!dev)
@@ -327,7 +315,8 @@ int em_dev_update_perf_domain(struct device *dev,
kref_get(&new_table->kref);
- old_table = pd->em_table;
+ old_table = rcu_dereference_protected(pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
rcu_assign_pointer(pd->em_table, new_table);
em_cpufreq_update_efficiencies(dev, new_table->state);
@@ -341,7 +330,7 @@ EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
struct em_perf_state *table,
- struct em_data_callback *cb,
+ const struct em_data_callback *cb,
unsigned long flags)
{
unsigned long power, freq, prev_freq = 0;
@@ -396,10 +385,11 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
}
static int em_create_pd(struct device *dev, int nr_states,
- struct em_data_callback *cb, cpumask_t *cpus,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus,
unsigned long flags)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
struct em_perf_domain *pd;
struct device *cpu_dev;
int cpu, ret, num_cpus;
@@ -556,9 +546,10 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
* Return 0 on success
*/
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
- struct em_data_callback *cb, cpumask_t *cpus,
- bool microwatts)
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts)
{
+ struct em_perf_table *em_table;
unsigned long cap, prev_cap = 0;
unsigned long flags = 0;
int cpu, ret;
@@ -631,7 +622,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
dev->em_pd->min_perf_state = 0;
dev->em_pd->max_perf_state = nr_states - 1;
- em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
+ em_table = rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
+ em_cpufreq_update_efficiencies(dev, em_table->state);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
@@ -668,7 +661,8 @@ void em_dev_unregister_perf_domain(struct device *dev)
mutex_lock(&em_pd_mutex);
em_debug_remove_pd(dev);
- em_table_free(dev->em_pd->em_table);
+ em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex)));
kfree(dev->em_pd);
dev->em_pd = NULL;
@@ -676,9 +670,9 @@ void em_dev_unregister_perf_domain(struct device *dev)
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
-static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd)
+static struct em_perf_table *em_table_dup(struct em_perf_domain *pd)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
struct em_perf_state *ps, *new_ps;
int ps_size;
@@ -700,7 +694,7 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd)
}
static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
- struct em_perf_table __rcu *em_table)
+ struct em_perf_table *em_table)
{
int ret;
@@ -728,10 +722,9 @@ free_em_table:
* are correctly calculated.
*/
static void em_adjust_new_capacity(struct device *dev,
- struct em_perf_domain *pd,
- u64 max_cap)
+ struct em_perf_domain *pd)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
em_table = em_table_dup(pd);
if (!em_table) {
@@ -775,7 +768,8 @@ static void em_check_capacity_update(void)
}
cpufreq_cpu_put(policy);
- pd = em_cpu_get(cpu);
+ dev = get_cpu_device(cpu);
+ pd = em_pd_get(dev);
if (!pd || em_is_artificial(pd))
continue;
@@ -799,8 +793,7 @@ static void em_check_capacity_update(void)
pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n",
cpu, cpu_capacity, em_max_perf);
- dev = get_cpu_device(cpu);
- em_adjust_new_capacity(dev, pd, cpu_capacity);
+ em_adjust_new_capacity(dev, pd);
}
free_cpumask_var(cpu_done_mask);
@@ -822,7 +815,7 @@ static void em_update_workfn(struct work_struct *work)
*/
int em_dev_update_chip_binning(struct device *dev)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
struct em_perf_domain *pd;
int i, ret;
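
Several energy_model.c hunks above drop the __rcu qualifier from local table pointers and instead load pd->em_table with rcu_dereference_protected(..., lockdep_is_held(&em_pd_mutex)) on update-side paths. A hedged sketch of that access pattern; everything except the RCU/lockdep helpers is an invented example:

#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct cfg {
	int value;
};

static DEFINE_MUTEX(cfg_mutex);
static struct cfg __rcu *active_cfg;

/* Update side: cfg_mutex serializes writers, so the pointer may be loaded
 * with rcu_dereference_protected() instead of a full RCU read section. */
static struct cfg *cfg_replace(struct cfg *new)
{
	struct cfg *old;

	lockdep_assert_held(&cfg_mutex);

	old = rcu_dereference_protected(active_cfg,
					lockdep_is_held(&cfg_mutex));
	rcu_assign_pointer(active_cfg, new);

	return old;	/* caller frees after a grace period, e.g. kfree_rcu() */
}
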
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 10a01af63a80..23c0f4e6cb2f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) "PM: hibernation: " fmt
+#include <crypto/acompress.h>
#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/suspend.h>
@@ -411,7 +412,7 @@ int hibernation_snapshot(int platform_mode)
goto Thaw;
}
- suspend_console();
+ console_suspend_all();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -437,7 +438,7 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
- resume_console();
+ console_resume_all();
dpm_complete(msg);
Close:
@@ -547,7 +548,7 @@ int hibernation_restore(int platform_mode)
int error;
pm_prepare_console();
- suspend_console();
+ console_suspend_all();
pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
@@ -561,7 +562,7 @@ int hibernation_restore(int platform_mode)
}
dpm_resume_end(PMSG_RECOVER);
pm_restore_gfp_mask();
- resume_console();
+ console_resume_all();
pm_restore_console();
return error;
}
@@ -586,7 +587,7 @@ int hibernation_platform_enter(void)
goto Close;
entering_platform_hibernation = true;
- suspend_console();
+ console_suspend_all();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -639,7 +640,7 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
- resume_console();
+ console_resume_all();
Close:
hibernation_ops->end();
@@ -757,7 +758,7 @@ int hibernate(void)
*/
if (!nocompress) {
strscpy(hib_comp_algo, hibernate_compressor, sizeof(hib_comp_algo));
- if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) {
+ if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
pr_err("%s compression is not available\n", hib_comp_algo);
return -EOPNOTSUPP;
}
@@ -901,7 +902,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data)
if (error)
goto dpm_complete;
- suspend_console();
+ console_suspend_all();
error = dpm_suspend(PMSG_FREEZE);
if (error)
@@ -925,7 +926,7 @@ skip:
dpm_resume:
dpm_resume(PMSG_THAW);
- resume_console();
+ console_resume_all();
dpm_complete:
dpm_complete(PMSG_THAW);
@@ -1008,7 +1009,7 @@ static int software_resume(void)
strscpy(hib_comp_algo, COMPRESSION_ALGO_LZ4, sizeof(hib_comp_algo));
else
strscpy(hib_comp_algo, COMPRESSION_ALGO_LZO, sizeof(hib_comp_algo));
- if (crypto_has_comp(hib_comp_algo, 0, 0) != 1) {
+ if (!crypto_has_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC)) {
pr_err("%s compression is not available\n", hib_comp_algo);
error = -EOPNOTSUPP;
goto Unlock;
@@ -1446,10 +1447,10 @@ static const char * const comp_alg_enabled[] = {
static int hibernate_compressor_param_set(const char *compressor,
const struct kernel_param *kp)
{
- unsigned int sleep_flags;
int index, ret;
- sleep_flags = lock_system_sleep();
+ if (!mutex_trylock(&system_transition_mutex))
+ return -EBUSY;
index = sysfs_match_string(comp_alg_enabled, compressor);
if (index >= 0) {
@@ -1461,7 +1462,7 @@ static int hibernate_compressor_param_set(const char *compressor,
ret = index;
}
- unlock_system_sleep(sleep_flags);
+ mutex_unlock(&system_transition_mutex);
if (ret)
pr_debug("Cannot set specified compressor %s\n",
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c9fb559a6399..4e6e24e8b854 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2270,9 +2270,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
*/
void *kaddr;
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
copy_page(buffer, kaddr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
handle->buffer = buffer;
} else {
handle->buffer = page_address(page);
@@ -2561,9 +2561,9 @@ static void copy_last_highmem_page(void)
if (last_highmem_page) {
void *dst;
- dst = kmap_atomic(last_highmem_page);
+ dst = kmap_local_page(last_highmem_page);
copy_page(dst, buffer);
- kunmap_atomic(dst);
+ kunmap_local(dst);
last_highmem_page = NULL;
}
}
@@ -2881,13 +2881,13 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2,
{
void *kaddr1, *kaddr2;
- kaddr1 = kmap_atomic(p1);
- kaddr2 = kmap_atomic(p2);
+ kaddr1 = kmap_local_page(p1);
+ kaddr2 = kmap_local_page(p2);
copy_page(buf, kaddr1);
copy_page(kaddr1, kaddr2);
copy_page(kaddr2, buf);
- kunmap_atomic(kaddr2);
- kunmap_atomic(kaddr1);
+ kunmap_local(kaddr2);
+ kunmap_local(kaddr1);
}
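
The snapshot.c hunks move from kmap_atomic() to kmap_local_page(). Local mappings do not disable preemption or pagefaults, but they must still be released in the reverse order they were created, as swap_two_pages_data() above does. A minimal sketch, with an illustrative copy helper:

#include <linux/highmem.h>
#include <linux/mm.h>

/* Copy one (possibly highmem) page to another using short-lived mappings. */
static void copy_one_page(struct page *dst_page, struct page *src_page)
{
	void *src = kmap_local_page(src_page);
	void *dst = kmap_local_page(dst_page);

	copy_page(dst, src);

	kunmap_local(dst);	/* unmap in reverse order of mapping */
	kunmap_local(src);
}
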
/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 09f8397bae15..8eaec4ab121d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -91,6 +91,16 @@ static void s2idle_enter(void)
{
trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
+ /*
+ * The correctness of the code below depends on the number of online
+ * CPUs being stable, but CPUs cannot be taken offline or put online
+ * while it is running.
+ *
+ * The s2idle_lock must be acquired before the pending wakeup check to
+ * prevent pm_system_wakeup() from running as a whole between that check
+ * and the subsequent s2idle_state update in which case a wakeup event
+	 * and the subsequent s2idle_state update, in which case a wakeup event
+ */
raw_spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
@@ -98,8 +108,6 @@ static void s2idle_enter(void)
s2idle_state = S2IDLE_STATE_ENTER;
raw_spin_unlock_irq(&s2idle_lock);
- cpus_read_lock();
-
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
@@ -112,8 +120,6 @@ static void s2idle_enter(void)
*/
wake_up_all_idle_cpus();
- cpus_read_unlock();
-
raw_spin_lock_irq(&s2idle_lock);
out:
@@ -502,7 +508,7 @@ int suspend_devices_and_enter(suspend_state_t state)
if (error)
goto Close;
- suspend_console();
+ console_suspend_all();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
@@ -521,9 +527,9 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
- trace_suspend_resume(TPS("resume_console"), state, true);
- resume_console();
- trace_suspend_resume(TPS("resume_console"), state, false);
+ trace_suspend_resume(TPS("console_resume_all"), state, true);
+ console_resume_all();
+ trace_suspend_resume(TPS("console_resume_all"), state, false);
Close:
platform_resume_end(state);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 82b884b67152..80ff5f933a62 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,6 +12,7 @@
#define pr_fmt(fmt) "PM: " fmt
+#include <crypto/acompress.h>
#include <linux/module.h>
#include <linux/file.h>
#include <linux/delay.h>
@@ -635,7 +636,8 @@ static int crc32_threadfn(void *data)
*/
struct cmp_data {
struct task_struct *thr; /* thread */
- struct crypto_comp *cc; /* crypto compressor stream */
+ struct crypto_acomp *cc; /* crypto compressor */
+ struct acomp_req *cr; /* crypto request */
atomic_t ready; /* ready to start flag */
atomic_t stop; /* ready to stop flag */
int ret; /* return code */
@@ -656,7 +658,6 @@ static atomic_t compressed_size = ATOMIC_INIT(0);
static int compress_threadfn(void *data)
{
struct cmp_data *d = data;
- unsigned int cmp_len = 0;
while (1) {
wait_event(d->go, atomic_read_acquire(&d->ready) ||
@@ -670,11 +671,13 @@ static int compress_threadfn(void *data)
}
atomic_set(&d->ready, 0);
- cmp_len = CMP_SIZE - CMP_HEADER;
- d->ret = crypto_comp_compress(d->cc, d->unc, d->unc_len,
- d->cmp + CMP_HEADER,
- &cmp_len);
- d->cmp_len = cmp_len;
+ acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+ acomp_request_set_src_nondma(d->cr, d->unc, d->unc_len);
+ acomp_request_set_dst_nondma(d->cr, d->cmp + CMP_HEADER,
+ CMP_SIZE - CMP_HEADER);
+ d->ret = crypto_acomp_compress(d->cr);
+ d->cmp_len = d->cr->dlen;
atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len);
atomic_set_release(&d->stop, 1);
@@ -745,13 +748,20 @@ static int save_compressed_image(struct swap_map_handle *handle,
init_waitqueue_head(&data[thr].go);
init_waitqueue_head(&data[thr].done);
- data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0);
+ data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR_OR_NULL(data[thr].cc)) {
pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
ret = -EFAULT;
goto out_clean;
}
+ data[thr].cr = acomp_request_alloc(data[thr].cc);
+ if (!data[thr].cr) {
+ pr_err("Could not allocate comp request\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+
data[thr].thr = kthread_run(compress_threadfn,
&data[thr],
"image_compress/%u", thr);
@@ -899,8 +909,8 @@ out_clean:
for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
- if (data[thr].cc)
- crypto_free_comp(data[thr].cc);
+ acomp_request_free(data[thr].cr);
+ crypto_free_acomp(data[thr].cc);
}
vfree(data);
}
@@ -1142,7 +1152,8 @@ static int load_image(struct swap_map_handle *handle,
*/
struct dec_data {
struct task_struct *thr; /* thread */
- struct crypto_comp *cc; /* crypto compressor stream */
+ struct crypto_acomp *cc; /* crypto compressor */
+ struct acomp_req *cr; /* crypto request */
atomic_t ready; /* ready to start flag */
atomic_t stop; /* ready to stop flag */
int ret; /* return code */
@@ -1160,7 +1171,6 @@ struct dec_data {
static int decompress_threadfn(void *data)
{
struct dec_data *d = data;
- unsigned int unc_len = 0;
while (1) {
wait_event(d->go, atomic_read_acquire(&d->ready) ||
@@ -1174,10 +1184,13 @@ static int decompress_threadfn(void *data)
}
atomic_set(&d->ready, 0);
- unc_len = UNC_SIZE;
- d->ret = crypto_comp_decompress(d->cc, d->cmp + CMP_HEADER, d->cmp_len,
- d->unc, &unc_len);
- d->unc_len = unc_len;
+ acomp_request_set_callback(d->cr, CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+ acomp_request_set_src_nondma(d->cr, d->cmp + CMP_HEADER,
+ d->cmp_len);
+ acomp_request_set_dst_nondma(d->cr, d->unc, UNC_SIZE);
+ d->ret = crypto_acomp_decompress(d->cr);
+ d->unc_len = d->cr->dlen;
if (clean_pages_on_decompress)
flush_icache_range((unsigned long)d->unc,
@@ -1254,13 +1267,20 @@ static int load_compressed_image(struct swap_map_handle *handle,
init_waitqueue_head(&data[thr].go);
init_waitqueue_head(&data[thr].done);
- data[thr].cc = crypto_alloc_comp(hib_comp_algo, 0, 0);
+ data[thr].cc = crypto_alloc_acomp(hib_comp_algo, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR_OR_NULL(data[thr].cc)) {
pr_err("Could not allocate comp stream %ld\n", PTR_ERR(data[thr].cc));
ret = -EFAULT;
goto out_clean;
}
+ data[thr].cr = acomp_request_alloc(data[thr].cc);
+ if (!data[thr].cr) {
+ pr_err("Could not allocate comp request\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+
data[thr].thr = kthread_run(decompress_threadfn,
&data[thr],
"image_decompress/%u", thr);
@@ -1507,8 +1527,8 @@ out_clean:
for (thr = 0; thr < nr_threads; thr++) {
if (data[thr].thr)
kthread_stop(data[thr].thr);
- if (data[thr].cc)
- crypto_free_comp(data[thr].cc);
+ acomp_request_free(data[thr].cr);
+ crypto_free_acomp(data[thr].cc);
}
vfree(data);
}
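
The compress/decompress threads above now drive the acomp API with linear, on-stack-style buffers: a transform and request are allocated once per thread, each operation points the request at plain buffers via acomp_request_set_src_nondma()/acomp_request_set_dst_nondma(), and the produced length is read back from the request's dlen field. A hedged, stripped-down sketch of one synchronous compression call using only the calls visible in the hunks above; the wrapper and its buffer handling are illustrative:

#include <crypto/acompress.h>
#include <linux/err.h>

/* Compress slen bytes from src into dst (capacity dcap). Returns the
 * compressed length or a negative error. CRYPTO_ALG_ASYNC is masked out so
 * the request completes synchronously. */
static int compress_buf(const char *alg, void *src, unsigned int slen,
			void *dst, unsigned int dcap)
{
	struct crypto_acomp *tfm;
	struct acomp_req *req;
	int ret;

	tfm = crypto_alloc_acomp(alg, 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	req = acomp_request_alloc(tfm);
	if (!req) {
		crypto_free_acomp(tfm);
		return -ENOMEM;
	}

	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
	acomp_request_set_src_nondma(req, src, slen);
	acomp_request_set_dst_nondma(req, dst, dcap);

	ret = crypto_acomp_compress(req);
	if (!ret)
		ret = req->dlen;

	acomp_request_free(req);
	crypto_free_acomp(tfm);
	return ret;
}
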
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index a91bdf802967..48a24e7b309d 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -64,6 +64,7 @@ struct dev_printk_info;
extern struct printk_ringbuffer *prb;
extern bool printk_kthreads_running;
+extern bool debug_non_panic_cpus;
__printf(4, 0)
int vprintk_store(int facility, int level,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 07668433644b..1eea80d0648e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2375,6 +2375,22 @@ void printk_legacy_allow_panic_sync(void)
}
}
+bool __read_mostly debug_non_panic_cpus;
+
+#ifdef CONFIG_PRINTK_CALLER
+static int __init debug_non_panic_cpus_setup(char *str)
+{
+ debug_non_panic_cpus = true;
+ pr_info("allow messages from non-panic CPUs in panic()\n");
+
+ return 0;
+}
+early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup);
+module_param(debug_non_panic_cpus, bool, 0644);
+MODULE_PARM_DESC(debug_non_panic_cpus,
+ "allow messages from non-panic CPUs in panic()");
+#endif
+
asmlinkage int vprintk_emit(int facility, int level,
const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
@@ -2391,7 +2407,9 @@ asmlinkage int vprintk_emit(int facility, int level,
* non-panic CPUs are generating any messages, they will be
* silently dropped.
*/
- if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace)
+ if (other_cpu_in_panic() &&
+ !debug_non_panic_cpus &&
+ !panic_triggering_all_cpu_backtrace)
return 0;
printk_get_console_flush_type(&ft);
@@ -2461,7 +2479,6 @@ asmlinkage __visible int _printk(const char *fmt, ...)
}
EXPORT_SYMBOL(_printk);
-static bool pr_flush(int timeout_ms, bool reset_on_progress);
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
#else /* CONFIG_PRINTK */
@@ -2474,7 +2491,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
static u64 syslog_seq;
-static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
#endif /* CONFIG_PRINTK */
@@ -2733,11 +2749,11 @@ module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool
MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");
/**
- * suspend_console - suspend the console subsystem
+ * console_suspend_all - suspend the console subsystem
*
* This disables printk() while we go into suspend states
*/
-void suspend_console(void)
+void console_suspend_all(void)
{
struct console *con;
@@ -2760,7 +2776,7 @@ void suspend_console(void)
synchronize_srcu(&console_srcu);
}
-void resume_console(void)
+void console_resume_all(void)
{
struct console_flush_type ft;
struct console *con;
@@ -3342,7 +3358,12 @@ void console_unblank(void)
*/
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
- if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) {
+ short flags = console_srcu_read_flags(c);
+
+ if (flags & CON_SUSPENDED)
+ continue;
+
+ if ((flags & CON_ENABLED) && c->unblank) {
found_unblank = true;
break;
}
@@ -3379,7 +3400,12 @@ void console_unblank(void)
cookie = console_srcu_read_lock();
for_each_console_srcu(c) {
- if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank)
+ short flags = console_srcu_read_flags(c);
+
+ if (flags & CON_SUSPENDED)
+ continue;
+
+ if ((flags & CON_ENABLED) && c->unblank)
c->unblank();
}
console_srcu_read_unlock(cookie);
@@ -3497,10 +3523,10 @@ struct tty_driver *console_device(int *index)
/*
* Prevent further output on the passed console device so that (for example)
- * serial drivers can disable console output before suspending a port, and can
+ * serial drivers can suspend console output before suspending a port, and can
* re-enable output afterwards.
*/
-void console_stop(struct console *console)
+void console_suspend(struct console *console)
{
__pr_flush(console, 1000, true);
console_list_lock();
@@ -3515,9 +3541,9 @@ void console_stop(struct console *console)
*/
synchronize_srcu(&console_srcu);
}
-EXPORT_SYMBOL(console_stop);
+EXPORT_SYMBOL(console_suspend);
-void console_start(struct console *console)
+void console_resume(struct console *console)
{
struct console_flush_type ft;
bool is_nbcon;
@@ -3542,7 +3568,7 @@ void console_start(struct console *console)
__pr_flush(console, 1000, true);
}
-EXPORT_SYMBOL(console_start);
+EXPORT_SYMBOL(console_resume);
#ifdef CONFIG_PRINTK
static int unregister_console_locked(struct console *console);
@@ -4277,6 +4303,11 @@ void __init console_init(void)
initcall_t call;
initcall_entry_t *ce;
+#ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE
+ if (!console_set_on_cmdline)
+ add_preferred_console("ttynull", 0, NULL);
+#endif
+
/* Setup the default TTY line discipline. */
n_tty_init();
@@ -4466,7 +4497,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
* Context: Process context. May sleep while acquiring console lock.
* Return: true if all usable printers are caught up.
*/
-static bool pr_flush(int timeout_ms, bool reset_on_progress)
+bool pr_flush(int timeout_ms, bool reset_on_progress)
{
return __pr_flush(NULL, timeout_ms, reset_on_progress);
}
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 88e8f3a61922..d9fb053cff67 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -2133,9 +2133,9 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
* there may be other finalized records beyond that
* need to be printed for a panic situation. If this
* is the panic CPU, skip this
- * non-existent/non-finalized record unless it is
- * at or beyond the head, in which case it is not
- * possible to continue.
+ * non-existent/non-finalized record unless non-panic
+ * CPUs are still running and their debugging is
+ * explicitly enabled.
*
* Note that new messages printed on panic CPU are
* finalized when we are here. The only exception
@@ -2143,10 +2143,13 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
* But it would have the sequence number returned
* by "prb_next_reserve_seq() - 1".
*/
- if (this_cpu_in_panic() && ((*seq + 1) < prb_next_reserve_seq(rb)))
+ if (this_cpu_in_panic() &&
+ (!debug_non_panic_cpus || legacy_allow_panic_sync) &&
+ ((*seq + 1) < prb_next_reserve_seq(rb))) {
(*seq)++;
- else
+ } else {
return false;
+ }
}
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index b9b6bc55185d..aa42de4d2768 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -18,7 +18,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPTION
+ default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC)
select TREE_RCU
help
This option selects the RCU implementation that is
@@ -65,6 +65,17 @@ config TREE_SRCU
help
This option selects the full-fledged version of SRCU.
+config FORCE_NEED_SRCU_NMI_SAFE
+ bool "Force selection of NEED_SRCU_NMI_SAFE"
+ depends on !TINY_SRCU
+ select NEED_SRCU_NMI_SAFE
+ default n
+ help
+ This option forces selection of the NEED_SRCU_NMI_SAFE
+ Kconfig option, allowing testing of srcu_read_lock_nmisafe()
+ and srcu_read_unlock_nmisafe() on architectures (like x86)
+ that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option.
+
config NEED_SRCU_NMI_SAFE
def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
@@ -91,7 +102,7 @@ config NEED_TASKS_RCU
config TASKS_RCU
bool
- default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO)
+ default NEED_TASKS_RCU && PREEMPTION
select IRQ_WORK
config FORCE_TASKS_RUDE_RCU
@@ -323,21 +334,27 @@ config RCU_LAZY
depends on RCU_NOCB_CPU
default n
help
- To save power, batch RCU callbacks and flush after delay, memory
- pressure, or callback list growing too big.
+ To save power, batch RCU callbacks and delay starting the
+ corresponding grace period for multiple seconds. The grace
+ period will be started after this delay, in case of memory
+ pressure, or if the corresponding CPU's callback list grows
+ too large.
- Requires rcu_nocbs=all to be set.
+ These delays happen only on rcu_nocbs CPUs, that is, CPUs
+ whose callbacks have been offloaded.
- Use rcutree.enable_rcu_lazy=0 to turn it off at boot time.
+ Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to
+ globally disable these delays.
config RCU_LAZY_DEFAULT_OFF
bool "Turn RCU lazy invocation off by default"
depends on RCU_LAZY
default n
help
- Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default
- off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch
- it back on.
+ Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel
+ to boot with these energy-efficiency delays disabled. Use the
+	  rcutree.enable_rcu_lazy=1 kernel-boot parameter to override
+	  this option at boot time, thus re-enabling these delays.
config RCU_DOUBLE_CHECK_CB_TIME
bool "RCU callback-batch backup time check"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 6af90510a1ca..12e4c64ebae1 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -54,7 +54,7 @@ config RCU_TORTURE_TEST
Say N if you are unsure.
config RCU_TORTURE_TEST_CHK_RDR_STATE
- tristate "Check rcutorture reader state"
+ bool "Check rcutorture reader state"
depends on RCU_TORTURE_TEST
default n
help
@@ -70,7 +70,7 @@ config RCU_TORTURE_TEST_CHK_RDR_STATE
Say N if you are unsure.
config RCU_TORTURE_TEST_LOG_CPU
- tristate "Log CPU for rcutorture failures"
+ bool "Log CPU for rcutorture failures"
depends on RCU_TORTURE_TEST
default n
help
@@ -84,6 +84,20 @@ config RCU_TORTURE_TEST_LOG_CPU
Say Y here if you want CPU IDs logged.
Say N if you are unsure.
+config RCU_TORTURE_TEST_LOG_GP
+ bool "Log grace-period numbers for rcutorture failures"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to decorate each entry of its
+ log of failure/close-call rcutorture reader segments with the
+ corresponding grace-period sequence numbers. This information
+ can be useful, but it does incur additional overhead, overhead
+ that can make both failures and close calls less probable.
+
+ Say Y here if you want grace-period sequence numbers logged.
+ Say N if you are unsure.
+
config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index feb3ac1dc5d5..eed2951a4962 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
{
unsigned long cur_s = READ_ONCE(*sp);
- return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1));
}
/*
@@ -590,6 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
#endif
static inline void rcu_gp_set_torture_wait(int duration) { }
#endif
+unsigned long long rcutorture_gather_gp_seqs(void);
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len);
#ifdef CONFIG_TINY_SRCU
@@ -611,8 +613,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; }
static inline unsigned long rcu_get_gp_seq(void) { return 0; }
static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
-static inline unsigned long
-srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
static inline void show_rcu_gp_kthreads(void) { }
@@ -624,7 +624,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
bool rcu_watching_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
-unsigned long srcu_batches_completed(struct srcu_struct *sp);
bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
@@ -636,6 +635,12 @@ void rcu_gp_slow_register(atomic_t *rgssp);
void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
+#ifdef CONFIG_TINY_SRCU
+static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+#else // #ifdef CONFIG_TINY_SRCU
+unsigned long srcu_batches_completed(struct srcu_struct *sp);
+#endif // #else // #ifdef CONFIG_TINY_SRCU
+
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_bind_current_to_nocb(void);
#else
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d26fb1d33ed9..65095664f5c5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -135,6 +135,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds.");
torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
@@ -147,6 +148,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
static int nrealnocbers;
static int nrealreaders;
+static int nrealfakewriters;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
@@ -272,6 +274,9 @@ struct rt_read_seg {
bool rt_preempted;
int rt_cpu;
int rt_end_cpu;
+ unsigned long long rt_gp_seq;
+ unsigned long long rt_gp_seq_end;
+ u64 rt_ts;
};
static int err_segs_recorded;
static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS];
@@ -406,6 +411,8 @@ struct rcu_torture_ops {
void (*gp_slow_register)(atomic_t *rgssp);
void (*gp_slow_unregister)(atomic_t *rgssp);
bool (*reader_blocked)(void);
+ unsigned long long (*gather_gp_seqs)(void);
+ void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len);
long cbflood_max;
int irq_capable;
int can_boost;
@@ -610,6 +617,8 @@ static struct rcu_torture_ops rcu_ops = {
.reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)
? has_rcu_reader_blocked
: NULL,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
@@ -655,6 +664,8 @@ static struct rcu_torture_ops rcu_busted_ops = {
.sync = synchronize_rcu_busted,
.exp_sync = synchronize_rcu_busted,
.call = call_rcu_busted,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
.irq_capable = 1,
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted"
@@ -677,8 +688,11 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
static int srcu_torture_read_lock(void)
{
int idx;
+ struct srcu_ctr __percpu *scp;
int ret = 0;
+ WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL);
+
if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
idx = srcu_read_lock(srcu_ctlp);
WARN_ON_ONCE(idx & ~0x1);
@@ -694,6 +708,12 @@ static int srcu_torture_read_lock(void)
WARN_ON_ONCE(idx & ~0x1);
ret += idx << 2;
}
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 3;
+ }
return ret;
}
@@ -719,6 +739,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
static void srcu_torture_read_unlock(int idx)
{
WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+ srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
if (reader_flavor & SRCU_READ_FLAVOR_LITE)
srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2);
if (reader_flavor & SRCU_READ_FLAVOR_NMI)
@@ -791,6 +813,7 @@ static struct rcu_torture_ops srcu_ops = {
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
@@ -834,6 +857,7 @@ static struct rcu_torture_ops srcud_ops = {
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
@@ -1148,8 +1172,19 @@ static int rcu_torture_boost(void *arg)
unsigned long gp_state;
unsigned long gp_state_time;
unsigned long oldstarttime;
+ unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ;
- VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ } else {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period");
+ while (time_before(jiffies, booststarttime)) {
+ schedule_timeout_idle(HZ);
+ if (kthread_should_stop())
+ goto cleanup;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period");
+ }
/* Set real-time priority. */
sched_set_fifo_low(current);
@@ -1225,6 +1260,7 @@ checkwait: if (stutter_wait("rcu_torture_boost"))
sched_set_fifo_low(current);
} while (!torture_must_stop());
+cleanup:
/* Clean up and exit. */
while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
@@ -1728,7 +1764,7 @@ rcu_torture_fakewriter(void *arg)
do {
torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
- torture_random(&rand) % (nfakewriters * 8) == 0) {
+ torture_random(&rand) % (nrealfakewriters * 8) == 0) {
cur_ops->cb_barrier();
} else {
switch (synctype[torture_random(&rand) % nsynctypes]) {
@@ -1873,6 +1909,8 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count()
static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq)
{
+ int mask;
+
if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE))
return;
@@ -1899,11 +1937,27 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
WARN_ONCE(cur_ops->extendables &&
!(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
- WARN_ONCE(cur_ops->extendables &&
- !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses preempt_disable()
+ * as rcu_read_lock().
+ */
+ mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+
+ WARN_ONCE(cur_ops->extendables && !(curstate & mask) &&
(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
- WARN_ONCE(cur_ops->readlock_nesting &&
- !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses "preempt_count() &
+ * PREEMPT_MASK" as ->readlock_nesting().
+ */
+ mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+
+ WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) &&
cur_ops->readlock_nesting() > 0, ROEC_ARGS);
}
@@ -1965,6 +2019,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
rtrsp[-1].rt_preempted = cur_ops->reader_blocked();
}
}
+ // Sample grace-period sequence number, as good a place as any.
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) {
+ rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs();
+ rtrsp->rt_ts = ktime_get_mono_fast_ns();
+ if (!first)
+ rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq;
+ }
/*
* Next, remove old protection, in decreasing order of strength
@@ -2512,7 +2573,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"shuffle_interval=%d stutter=%d irqreader=%d "
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
- "test_boost_duration=%d shutdown_secs=%d "
+ "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d "
"stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"stall_cpu_block=%d stall_cpu_repeat=%d "
"n_barrier_cbs=%d "
@@ -2522,11 +2583,11 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"nocbs_nthreads=%d nocbs_toggle=%d "
"test_nmis=%d "
"preempt_duration=%d preempt_interval=%d\n",
- torture_type, tag, nrealreaders, nfakewriters,
+ torture_type, tag, nrealreaders, nrealfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
- test_boost_interval, test_boost_duration, shutdown_secs,
+ test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs,
stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
stall_cpu_block, stall_cpu_repeat,
n_barrier_cbs,
@@ -3553,6 +3614,7 @@ rcu_torture_cleanup(void)
int flags = 0;
unsigned long gp_seq = 0;
int i;
+ int j;
if (torture_cleanup_begin()) {
if (cur_ops->cb_barrier != NULL) {
@@ -3597,7 +3659,7 @@ rcu_torture_cleanup(void)
rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++)
+ for (i = 0; i < nrealfakewriters; i++)
torture_stop_kthread(rcu_torture_fakewriter,
fakewriter_tasks[i]);
kfree(fakewriter_tasks);
@@ -3635,7 +3697,11 @@ rcu_torture_cleanup(void)
pr_alert("\t: No segments recorded!!!\n");
firsttime = 1;
for (i = 0; i < rt_read_nsegs; i++) {
- pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate);
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP))
+ pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL));
+ else
+ pr_alert("\t");
+ pr_cont("%d: %#4x", i, err_segs[i].rt_readstate);
if (err_segs[i].rt_delay_jiffies != 0) {
pr_cont("%s%ldjiffies", firsttime ? "" : "+",
err_segs[i].rt_delay_jiffies);
@@ -3648,6 +3714,27 @@ rcu_torture_cleanup(void)
else
pr_cont(" ...");
}
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) &&
+ cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) {
+ char buf1[20+1];
+ char buf2[20+1];
+ char sepchar = '-';
+
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq,
+ buf1, ARRAY_SIZE(buf1));
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end,
+ buf2, ARRAY_SIZE(buf2));
+ if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) {
+ if (buf2[0]) {
+ for (j = 0; buf2[j]; j++)
+ buf2[j] = '.';
+ if (j)
+ buf2[j - 1] = ' ';
+ }
+ sepchar = ' ';
+ }
+ pr_cont(" %s%c%s", buf1, sepchar, buf2);
+ }
if (err_segs[i].rt_delay_ms != 0) {
pr_cont(" %s%ldms", firsttime ? "" : "+",
err_segs[i].rt_delay_ms);
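
A standalone model of the end-of-segment string munging above; it assumes "g%04x"-style strings such as those produced by the Tiny RCU formatter later in this patch, and the helper name is made up for illustration:

#include <stdio.h>
#include <string.h>

/* When the grace-period string did not change across the segment, the
 * second copy is turned into dots (last one a space) and the '-'
 * separator is dropped, matching the pr_cont() output above. */
static void fmt_gp_pair(char *out, size_t len, const char *beg, const char *end)
{
	char buf2[32];
	char sep = '-';

	snprintf(buf2, sizeof(buf2), "%s", end);
	if (!strcmp(beg, end)) {
		size_t j = strlen(buf2);

		memset(buf2, '.', j);
		if (j)
			buf2[j - 1] = ' ';
		sep = ' ';
	}
	snprintf(out, len, "%s%c%s", beg, sep, buf2);
}

int main(void)
{
	char line[64];

	fmt_gp_pair(line, sizeof(line), "g0004", "g0008");
	printf("[%s]\n", line);		/* [g0004-g0008] */
	fmt_gp_pair(line, sizeof(line), "g0004", "g0004");
	printf("[%s]\n", line);		/* [g0004 .... ] */
	return 0;
}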
@@ -3994,6 +4081,14 @@ rcu_torture_init(void)
rcu_torture_init_srcu_lockdep();
+ if (nfakewriters >= 0) {
+ nrealfakewriters = nfakewriters;
+ } else {
+ nrealfakewriters = num_online_cpus() - 2 - nfakewriters;
+ if (nrealfakewriters <= 0)
+ nrealfakewriters = 1;
+ }
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -4050,8 +4145,9 @@ rcu_torture_init(void)
writer_task);
if (torture_init_error(firsterr))
goto unwind;
- if (nfakewriters > 0) {
- fakewriter_tasks = kcalloc(nfakewriters,
+
+ if (nrealfakewriters > 0) {
+ fakewriter_tasks = kcalloc(nrealfakewriters,
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
@@ -4060,7 +4156,7 @@ rcu_torture_init(void)
goto unwind;
}
}
- for (i = 0; i < nfakewriters; i++) {
+ for (i = 0; i < nrealfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
NULL, fakewriter_tasks[i]);
if (torture_init_error(firsterr))
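
For reference, the nrealfakewriters handling added above follows the usual torture-test convention for module parameters; a userspace sketch with a hypothetical helper name:

#include <stdio.h>

/* Non-negative values are used verbatim; a negative value asks for
 * num_online_cpus() - 2 plus the magnitude of the parameter, with a
 * floor of one fake writer. */
static int resolve_nfakewriters(int nfakewriters, int ncpus)
{
	int n;

	if (nfakewriters >= 0)
		return nfakewriters;
	n = ncpus - 2 - nfakewriters;	/* subtracting a negative adds |nfakewriters| */
	return n > 0 ? n : 1;
}

int main(void)
{
	printf("%d\n", resolve_nfakewriters(4, 8));	/* 4: explicit request */
	printf("%d\n", resolve_nfakewriters(-1, 8));	/* 7: 8 - 2 + 1 */
	printf("%d\n", resolve_nfakewriters(-1, 2));	/* 1: floor */
	return 0;
}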
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 1b47376acdc4..f11a7c2af778 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -216,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = {
.name = "srcu"
};
+static void srcu_fast_ref_scale_read_section(const int nloops)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
+ }
+}
+
+static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
+ }
+}
+
+static const struct ref_scale_ops srcu_fast_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = srcu_fast_ref_scale_read_section,
+ .delaysection = srcu_fast_ref_scale_delay_section,
+ .name = "srcu-fast"
+};
+
static void srcu_lite_ref_scale_read_section(const int nloops)
{
int i;
@@ -1163,7 +1193,7 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static const struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
+ &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
&refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops,
&acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 4dcbf8aa80ff..6e9fe2ce1075 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -20,7 +20,11 @@
#include "rcu_segcblist.h"
#include "rcu.h"
+#ifndef CONFIG_TREE_RCU
int rcu_scheduler_active __read_mostly;
+#else // #ifndef CONFIG_TREE_RCU
+extern int rcu_scheduler_active;
+#endif // #else // #ifndef CONFIG_TREE_RCU
static LIST_HEAD(srcu_boot_list);
static bool srcu_init_done;
@@ -98,7 +102,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
int newval;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
preempt_enable();
@@ -120,7 +124,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
preempt_enable();
return; /* Already running or nothing to do. */
@@ -138,7 +142,7 @@ void srcu_drive_gp(struct work_struct *wp)
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
preempt_enable();
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
preempt_enable();
@@ -159,7 +163,7 @@ void srcu_drive_gp(struct work_struct *wp)
* at interrupt level, but the ->srcu_gp_running checks will
* straighten that out.
*/
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
WRITE_ONCE(ssp->srcu_gp_running, false);
idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
preempt_enable();
@@ -172,7 +176,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
unsigned long cookie;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
cookie = get_state_synchronize_srcu(ssp);
if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
preempt_enable();
@@ -199,7 +203,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rhp->func = func;
rhp->next = NULL;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
local_irq_save(flags);
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
@@ -261,7 +265,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
{
unsigned long ret;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
ret = get_state_synchronize_srcu(ssp);
srcu_gp_start_if_needed(ssp);
preempt_enable();
@@ -282,11 +286,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+#ifndef CONFIG_TREE_RCU
/* Lockdep diagnostics. */
void __init rcu_scheduler_starting(void)
{
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
}
+#endif // #ifndef CONFIG_TREE_RCU
/*
* Queue work for srcu_struct structures with early boot callbacks.
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index b83c74c4dcc0..d2a694944553 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -116,8 +116,9 @@ do { \
/*
* Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
- * srcu_read_unlock() running against them. So if the is_static parameter
- * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
+ * srcu_read_unlock() running against them. So if the is_static
+ * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and
+ * ->srcu_ctrs[].srcu_unlocks.
*/
static void init_srcu_struct_data(struct srcu_struct *ssp)
{
@@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) !=
- ARRAY_SIZE(sdp->srcu_unlock_count));
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
@@ -247,15 +246,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->srcu_sup->node = NULL;
mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
- ssp->srcu_idx = 0;
ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_barrier_seq = 0;
mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
ssp->srcu_sup->sda_is_static = is_static;
- if (!is_static)
+ if (!is_static) {
ssp->sda = alloc_percpu(struct srcu_data);
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
+ }
if (!ssp->sda)
goto err_free_sup;
init_srcu_struct_data(ssp);
@@ -429,10 +429,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp)
}
/*
- * Computes approximate total of the readers' ->srcu_lock_count[] values
- * for the rank of per-CPU counters specified by idx, and returns true if
- * the caller did the proper barrier (gp), and if the count of the locks
- * matches that of the unlocks passed in.
+ * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks
+ * values for the rank of per-CPU counters specified by idx, and returns
+ * true if the caller did the proper barrier (gp), and if the count of
+ * the locks matches that of the unlocks passed in.
*/
static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks)
{
@@ -443,20 +443,20 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&sdp->srcu_lock_count[idx]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks);
if (IS_ENABLED(CONFIG_PROVE_RCU))
mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
"Mixed reader flavors for srcu_struct at %ps.\n", ssp);
- if (mask & SRCU_READ_FLAVOR_LITE && !gp)
+ if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp)
return false;
return sum == unlocks;
}
/*
- * Returns approximate total of the readers' ->srcu_unlock_count[] values
- * for the rank of per-CPU counters specified by idx.
+ * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks
+ * values for the rank of per-CPU counters specified by idx.
*/
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm)
{
@@ -467,7 +467,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, u
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&sdp->srcu_unlock_count[idx]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks);
mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
@@ -487,7 +487,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
unsigned long unlocks;
unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm);
- did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE);
+ did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP);
/*
* Make sure that a lock is always counted if the corresponding
@@ -509,48 +509,49 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
* If the locks are the same as the unlocks, then there must have
* been no readers on this index at some point in this function.
* But there might be more readers, as a task might have read
- * the current ->srcu_idx but not yet have incremented its CPU's
- * ->srcu_lock_count[idx] counter. In fact, it is possible
+ * the current ->srcu_ctrp but not yet have incremented its CPU's
+ * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible
* that most of the tasks have been preempted between fetching
- * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
- * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
- * in a system whose address space was fully populated with memory.
- * Call this quantity Nt.
+ * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And
+ * there could be almost (ULONG_MAX / sizeof(struct task_struct))
+ * tasks in a system whose address space was fully populated
+ * with memory. Call this quantity Nt.
*
- * So suppose that the updater is preempted at this point in the
- * code for a long time. That now-preempted updater has already
- * flipped ->srcu_idx (possibly during the preceding grace period),
- * done an smp_mb() (again, possibly during the preceding grace
- * period), and summed up the ->srcu_unlock_count[idx] counters.
- * How many times can a given one of the aforementioned Nt tasks
- * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
- * counter, in the absence of nesting?
+ * So suppose that the updater is preempted at this
+ * point in the code for a long time. That now-preempted
+ * updater has already flipped ->srcu_ctrp (possibly during
+ * the preceding grace period), done an smp_mb() (again,
+ * possibly during the preceding grace period), and summed up
+ * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times
+ * can a given one of the aforementioned Nt tasks increment the
+ * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter,
+ * in the absence of nesting?
*
* It can clearly do so once, given that it has already fetched
- * the old value of ->srcu_idx and is just about to use that value
- * to index its increment of ->srcu_lock_count[idx]. But as soon as
- * it leaves that SRCU read-side critical section, it will increment
- * ->srcu_unlock_count[idx], which must follow the updater's above
- * read from that same value. Thus, as soon the reading task does
- * an smp_mb() and a later fetch from ->srcu_idx, that task will be
- * guaranteed to get the new index. Except that the increment of
- * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
- * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
- * is before the smp_mb(). Thus, that task might not see the new
- * value of ->srcu_idx until the -second- __srcu_read_lock(),
- * which in turn means that this task might well increment
- * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
- * not just once.
+ * the old value of ->srcu_ctrp and is just about to use that
+ * value to index its increment of ->srcu_ctrs[idx].srcu_locks.
+ * But as soon as it leaves that SRCU read-side critical section,
+ * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must
+ * follow the updater's above read from that same value. Thus,
+ * as soon as the reading task does an smp_mb() and a later fetch from
+ * ->srcu_ctrp, that task will be guaranteed to get the new index.
+ * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks
+ * in __srcu_read_unlock() is after the smp_mb(), and the fetch
+ * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb().
+ * Thus, that task might not see the new value of ->srcu_ctrp until
+ * the -second- __srcu_read_lock(), which in turn means that this
+ * task might well increment ->srcu_ctrs[idx].srcu_locks for the
+ * old value of ->srcu_ctrp twice, not just once.
*
* However, it is important to note that a given smp_mb() takes
* effect not just for the task executing it, but also for any
* later task running on that same CPU.
*
- * That is, there can be almost Nt + Nc further increments of
- * ->srcu_lock_count[idx] for the old index, where Nc is the number
- * of CPUs. But this is OK because the size of the task_struct
- * structure limits the value of Nt and current systems limit Nc
- * to a few thousand.
+ * That is, there can be almost Nt + Nc further increments
+ * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc
+ * is the number of CPUs. But this is OK because the size of
+ * the task_struct structure limits the value of Nt and current
+ * systems limit Nc to a few thousand.
*
* OK, but what about nesting? This does impose a limit on
* nesting of half of the size of the task_struct structure
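
The bound argued in the comment above can be made concrete with a little arithmetic; the task_struct size below is an illustrative assumption, order of magnitude only:

#include <stdio.h>

int main(void)
{
	unsigned long ulong_max = ~0UL;
	unsigned long task_struct_size = 8192;	/* assumed, not measured */
	unsigned long nt_max = ulong_max / task_struct_size;
	unsigned long nc = 4096;		/* generous CPU count */

	/* At most roughly Nt + Nc stray increments can land on the old
	 * rank's srcu_locks counter, nowhere near enough to wrap an
	 * unsigned long and fool the sum-equality check. */
	printf("Nt upper bound: %lu\n", nt_max);
	printf("Nt + Nc:        %lu  (ULONG_MAX: %lu)\n", nt_max + nc, ulong_max);
	return 0;
}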
@@ -581,10 +582,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&sdp->srcu_lock_count[0]);
- sum += atomic_long_read(&sdp->srcu_lock_count[1]);
- sum -= atomic_long_read(&sdp->srcu_unlock_count[0]);
- sum -= atomic_long_read(&sdp->srcu_unlock_count[1]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks);
+ sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks);
}
return sum;
}
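
A userspace model of the srcu_readers_active() sum above, using a simplified stand-in for struct srcu_ctr:

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Simplified stand-in for struct srcu_ctr: one lock/unlock pair per rank. */
struct ctr {
	long locks;
	long unlocks;
};

static struct ctr ctrs[NR_CPUS][2];	/* [cpu][rank] */

/* Readers exist iff the grand total of locks minus unlocks over both
 * ranks and all CPUs is nonzero, exactly as in the hunk above. */
static bool readers_active(void)
{
	long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		sum += ctrs[cpu][0].locks + ctrs[cpu][1].locks;
		sum -= ctrs[cpu][0].unlocks + ctrs[cpu][1].unlocks;
	}
	return sum != 0;
}

int main(void)
{
	ctrs[1][0].locks = 2;
	ctrs[2][0].unlocks = 2;
	printf("active: %d\n", readers_active());	/* 0: all readers done */
	ctrs[0][1].locks = 1;
	printf("active: %d\n", readers_active());	/* 1: one reader in flight */
	return 0;
}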
@@ -647,6 +648,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
unsigned long jbase = SRCU_INTERVAL;
struct srcu_usage *sup = ssp->srcu_sup;
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
if (srcu_gp_is_expedited(ssp))
jbase = 0;
if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
@@ -674,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ unsigned long delay;
struct srcu_usage *sup = ssp->srcu_sup;
- if (WARN_ON(!srcu_get_delay(ssp)))
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ delay = srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ if (WARN_ON(!delay))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
@@ -743,12 +749,11 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);
*/
int __srcu_read_lock(struct srcu_struct *ssp)
{
- int idx;
+ struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
+ this_cpu_inc(scp->srcu_locks.counter);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- return idx;
+ return __srcu_ptr_to_ctr(ssp, scp);
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -760,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
+ this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -773,13 +778,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
*/
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
- int idx;
- struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+ struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp);
+ struct srcu_ctr *scp = raw_cpu_ptr(scpp);
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- atomic_long_inc(&sdp->srcu_lock_count[idx]);
+ atomic_long_inc(&scp->srcu_locks);
smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
- return idx;
+ return __srcu_ptr_to_ctr(ssp, scpp);
}
EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
@@ -790,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
*/
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
- struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
-
smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
- atomic_long_inc(&sdp->srcu_unlock_count[idx]);
+ atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
@@ -1096,13 +1098,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
/*
* Wait until all readers counted by array index idx complete, but
* loop an additional time if there is an expedited grace period pending.
- * The caller must ensure that ->srcu_idx is not changed while checking.
+ * The caller must ensure that ->srcu_ctrp is not changed while checking.
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
unsigned long curdelay;
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = !srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
@@ -1114,30 +1118,30 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
}
/*
- * Increment the ->srcu_idx counter so that future SRCU readers will
+ * Increment the ->srcu_ctrp counter so that future SRCU readers will
* use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
* us to wait for pre-existing readers in a starvation-free manner.
*/
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Because the flip of ->srcu_idx is executed only if the
+ * Because the flip of ->srcu_ctrp is executed only if the
* preceding call to srcu_readers_active_idx_check() found that
- * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
- * and because that summing uses atomic_long_read(), there is
- * ordering due to a control dependency between that summing and
- * the WRITE_ONCE() in this call to srcu_flip(). This ordering
- * ensures that if this updater saw a given reader's increment from
- * __srcu_read_lock(), that reader was using a value of ->srcu_idx
- * from before the previous call to srcu_flip(), which should be
- * quite rare. This ordering thus helps forward progress because
- * the grace period could otherwise be delayed by additional
- * calls to __srcu_read_lock() using that old (soon to be new)
- * value of ->srcu_idx.
+ * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums
+ * matched and because that summing uses atomic_long_read(),
+ * there is ordering due to a control dependency between that
+ * summing and the WRITE_ONCE() in this call to srcu_flip().
+ * This ordering ensures that if this updater saw a given reader's
+ * increment from __srcu_read_lock(), that reader was using a value
+ * of ->srcu_ctrp from before the previous call to srcu_flip(),
+ * which should be quite rare. This ordering thus helps forward
+ * progress because the grace period could otherwise be delayed
+ * by additional calls to __srcu_read_lock() using that old (soon
+ * to be new) value of ->srcu_ctrp.
*
* This sum-equality check and ordering also ensures that if
* a given call to __srcu_read_lock() uses the new value of
- * ->srcu_idx, this updater's earlier scans cannot have seen
+ * ->srcu_ctrp, this updater's earlier scans cannot have seen
* that reader's increments, which is all to the good, because
* this grace period need not wait on that reader. After all,
* if those earlier scans had seen that reader, there would have
@@ -1152,7 +1156,8 @@ static void srcu_flip(struct srcu_struct *ssp)
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
+ WRITE_ONCE(ssp->srcu_ctrp,
+ &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]);
/*
* Ensure that if the updater misses an __srcu_read_unlock()
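
The WRITE_ONCE() above replaces the old integer increment with a pointer toggle between the two elements of ->srcu_ctrs[]; a tiny standalone model of that index arithmetic:

#include <assert.h>
#include <stdio.h>

struct ctr { long locks; long unlocks; };

static struct ctr ctrs[2];
static struct ctr *ctrp = &ctrs[0];

/* (ctrp - &ctrs[0]) is the current index (0 or 1); '!' picks the other. */
static void flip(void)
{
	ctrp = &ctrs[!(ctrp - &ctrs[0])];
}

int main(void)
{
	assert(ctrp == &ctrs[0]);
	flip();
	assert(ctrp == &ctrs[1]);
	flip();
	assert(ctrp == &ctrs[0]);
	printf("flip ok\n");
	return 0;
}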
@@ -1198,7 +1203,7 @@ static bool srcu_should_expedite(struct srcu_struct *ssp)
check_init_srcu_struct(ssp);
/* If _lite() readers, don't do unsolicited expediting. */
- if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)
+ if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP)
return false;
/* If the local srcu_data structure has callbacks, not idle. */
sdp = raw_cpu_ptr(ssp->sda);
@@ -1398,8 +1403,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
* read-side critical sections are delimited by srcu_read_lock() and
* srcu_read_unlock(), and may be nested.
*
- * The callback will be invoked from process context, but must nevertheless
- * be fast and must not block.
+ * The callback will be invoked from process context, but with bh
+ * disabled. The callback function must therefore be fast and must
+ * not block.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func)
@@ -1465,8 +1474,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
*
* Wait for the count to drain to zero of both indexes. To avoid the
* possible starvation of synchronize_srcu(), it waits for the count of
- * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first,
- * and then flip the srcu_idx and wait for the count of the other index.
+ * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero
+ * at first, and then flip the ->srcu_ctrp and wait for the count of the
+ * other index.
*
* Can block; must be called from process context.
*
@@ -1675,7 +1685,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
*/
unsigned long srcu_batches_completed(struct srcu_struct *ssp)
{
- return READ_ONCE(ssp->srcu_idx);
+ return READ_ONCE(ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
@@ -1692,7 +1702,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
/*
* Because readers might be delayed for an extended period after
- * fetching ->srcu_idx for their index, at any point in time there
+ * fetching ->srcu_ctrp for their index, at any point in time there
* might well be readers using both idx=0 and idx=1. We therefore
* need to wait for readers to clear from both index values before
* invoking a callback.
@@ -1720,7 +1730,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
}
if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
- idx = 1 ^ (ssp->srcu_idx & 1);
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
if (!try_check_zero(ssp, idx, 1)) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
@@ -1738,7 +1748,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* SRCU read-side critical sections are normally short,
* so check at least twice in quick succession after a flip.
*/
- idx = 1 ^ (ssp->srcu_idx & 1);
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
if (!try_check_zero(ssp, idx, 2)) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
@@ -1849,7 +1859,9 @@ static void process_srcu(struct work_struct *work)
ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (curdelay) {
WRITE_ONCE(sup->reschedule_count, 0);
} else {
@@ -1896,7 +1908,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
int ss_state_idx = ss_state;
- idx = ssp->srcu_idx & 0x1;
+ idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0];
if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
@@ -1914,8 +1926,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
struct srcu_data *sdp;
sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
- u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
+ u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks));
+ u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks));
/*
* Make sure that a lock is always counted if the corresponding
@@ -1923,8 +1935,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
*/
smp_rmb();
- l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
- l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
+ l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks));
+ l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks));
c0 = l0 - u0;
c1 = l1 - u1;
@@ -2001,6 +2013,7 @@ static int srcu_module_coming(struct module *mod)
ssp->sda = alloc_percpu(struct srcu_data);
if (WARN_ON_ONCE(!ssp->sda))
return -ENOMEM;
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
}
return 0;
}
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 59314da5eb60..466668eb4fad 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -2256,7 +2256,7 @@ void __init tasks_cblist_init_generic(void)
#endif
}
-void __init rcu_init_tasks_generic(void)
+static int __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
rcu_spawn_tasks_kthread();
@@ -2272,7 +2272,10 @@ void __init rcu_init_tasks_generic(void)
// Run the self-tests.
rcu_tasks_initiate_self_tests();
+
+ return 0;
}
+core_initcall(rcu_init_tasks_generic);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 4b3f31911465..c1ebfd51768b 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user)
static inline bool rcu_reclaim_tiny(struct rcu_head *head)
{
rcu_callback_t f;
- unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
- if (__is_kvfree_rcu_offset(offset)) {
- trace_rcu_invoke_kvfree_callback("", head, offset);
- kvfree((void *)head - offset);
- rcu_lock_release(&rcu_callback_map);
- return true;
- }
trace_rcu_invoke_callback("", head);
f = head->func;
@@ -159,10 +152,6 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
-static void tiny_rcu_leak_callback(struct rcu_head *rhp)
-{
-}
-
/*
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
@@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
mem_dump_obj(head);
}
-
- if (!__is_kvfree_rcu_offset((unsigned long)head->func))
- WRITE_ONCE(head->func, tiny_rcu_leak_callback);
return;
}
@@ -246,15 +232,18 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
-#ifdef CONFIG_KASAN_GENERIC
-void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST)
+unsigned long long rcutorture_gather_gp_seqs(void)
{
- if (head)
- kasan_record_aux_stack(ptr);
+ return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL;
+}
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
- __kvfree_call_rcu(head, ptr);
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
+{
+ snprintf(cp, len, "g%04llx", seqs & 0xffffULL);
}
-EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
#endif
void __init rcu_init(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 475f31deed14..659f83e71048 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -538,6 +538,26 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
+/* Gather grace-period sequence numbers for rcutorture diagnostics. */
+unsigned long long rcutorture_gather_gp_seqs(void)
+{
+ return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) |
+ ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) |
+ (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL);
+}
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
+
+/* Format grace-period sequence numbers for rcutorture diagnostics. */
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
+{
+ unsigned int egp = (seqs >> 16) & 0xffffffULL;
+ unsigned int ggp = (seqs >> 40) & 0xffffULL;
+ unsigned int pgp = seqs & 0xffffULL;
+
+ snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp);
+}
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
+
#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
* An empty function that will trigger a reschedule on
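
A userspace demonstration of the bit packing used by the two helpers added above, showing that the g/e/p fields round-trip through the 64-bit snapshot:

#include <stdio.h>

/* Pack the three low-order sequence snapshots as the helper above does:
 * 16 bits of gp_seq, 24 bits of expedited_sequence, 16 bits of gp_seq_polled. */
static unsigned long long pack_gp_seqs(unsigned long gp, unsigned long exp,
				       unsigned long polled)
{
	return ((gp & 0xffffULL) << 40) | ((exp & 0xffffffULL) << 16) |
	       (polled & 0xffffULL);
}

int main(void)
{
	unsigned long long seqs = pack_gp_seqs(0x1234, 0xabcdef, 0x5678);
	unsigned int ggp = (seqs >> 40) & 0xffff;
	unsigned int egp = (seqs >> 16) & 0xffffff;
	unsigned int pgp = seqs & 0xffff;

	printf("g%04x:e%06x:p%04x\n", ggp, egp, pgp);	/* g1234:eabcdef:p5678 */
	return 0;
}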
@@ -1254,7 +1274,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Handle the ends of any preceding grace periods first. */
if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
- unlikely(READ_ONCE(rdp->gpwrap))) {
+ unlikely(rdp->gpwrap)) {
if (!offloaded)
ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
rdp->core_needs_qs = false;
@@ -1268,7 +1288,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
- unlikely(READ_ONCE(rdp->gpwrap))) {
+ unlikely(rdp->gpwrap)) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1283,7 +1303,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
- if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+ if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap)
WRITE_ONCE(rdp->last_sched_clock, jiffies);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
@@ -1612,12 +1632,10 @@ static void rcu_sr_normal_complete(struct llist_node *node)
{
struct rcu_synchronize *rs = container_of(
(struct rcu_head *) node, struct rcu_synchronize, head);
- unsigned long oldstate = (unsigned long) rs->head.func;
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
- !poll_state_synchronize_rcu(oldstate),
- "A full grace period is not passed yet: %lu",
- rcu_seq_diff(get_state_synchronize_rcu(), oldstate));
+ !poll_state_synchronize_rcu_full(&rs->oldstate),
+ "A full grace period is not passed yet!\n");
/* Finally. */
complete(&rs->completion);
@@ -1801,10 +1819,14 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Advance to a new grace period and initialize state. */
record_gp_stall_check_time();
+ /*
+ * A new wait segment must be started before gp_seq advanced, so
+ * that previous gp waiters won't observe the new gp_seq.
+ */
+ start_new_poll = rcu_sr_normal_gp_init();
/* Record GP times before starting GP, hence rcu_seq_start(). */
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
- start_new_poll = rcu_sr_normal_gp_init();
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
@@ -2931,13 +2953,8 @@ static int __init rcu_spawn_core_kthreads(void)
static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func)
{
rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kvfree_rcu_offset((unsigned long)func))
- trace_rcu_kvfree_callback(rcu_state.name, head,
- (unsigned long)func,
- rcu_segcblist_n_cbs(&rdp->cblist));
- else
- trace_rcu_callback(rcu_state.name, head,
- rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_callback(rcu_state.name, head,
+ rcu_segcblist_n_cbs(&rdp->cblist));
trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
}
@@ -3107,7 +3124,7 @@ module_param(enable_rcu_lazy, bool, 0444);
* critical sections have completed.
*
* Use this API instead of call_rcu() if you don't want the callback to be
- * invoked after very long periods of time, which can happen on systems without
+ * delayed for very long periods of time, which can happen on systems without
* memory pressure and on systems which are lightly loaded or mostly idle.
* This function will cause callbacks to be invoked sooner than later at the
* expense of extra power. Other than that, this function is identical to, and
@@ -3138,6 +3155,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
* might well execute concurrently with RCU read-side critical sections
* that started after call_rcu() was invoked.
*
+ * It is perfectly legal to repost an RCU callback, potentially with
+ * a different callback function, from within its callback function.
+ * The specified function will be invoked after another full grace period
+ * has elapsed. This use case is similar in form to the common practice
+ * of reposting a timer from within its own handler.
+ *
* RCU read-side critical sections are delimited by rcu_read_lock()
* and rcu_read_unlock(), and may be nested. In addition, but only in
* v5.0 and later, regions of code across which interrupts, preemption,
@@ -3166,6 +3189,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
*
* Implementation of these memory-ordering guarantees is described here:
* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ *
+ * Specific to call_rcu() (as opposed to the other call_rcu*() functions),
+ * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many
+ * seconds before starting the grace period needed by the corresponding
+ * callback. This delay can significantly improve energy-efficiency
+ * on low-utilization battery-powered devices. To avoid this delay,
+ * in latency-sensitive kernel code, use call_rcu_hurry().
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
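
A hedged sketch of the callback-reposting pattern that the added kerneldoc describes; the structure and function names below are illustrative, not part of this patch:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo {
	struct rcu_head rh;
	int remaining;
};

static void demo_cb(struct rcu_head *rhp)
{
	struct demo *d = container_of(rhp, struct demo, rh);

	if (--d->remaining > 0) {
		/* Legal per the kerneldoc above: repost from within the
		 * callback; it runs after another full grace period,
		 * much like re-arming a timer from its own handler. */
		call_rcu(&d->rh, demo_cb);
		return;
	}
	kfree(d);
}

/* Latency-sensitive callers would use call_rcu_hurry(&d->rh, demo_cb)
 * instead, avoiding the multi-second CONFIG_RCU_LAZY batching delay. */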
@@ -3214,7 +3244,7 @@ static void synchronize_rcu_normal(void)
* snapshot before adding a request.
*/
if (IS_ENABLED(CONFIG_PROVE_RCU))
- rs.head.func = (void *) get_state_synchronize_rcu();
+ get_state_synchronize_rcu_full(&rs.oldstate);
rcu_sr_normal_add_req(&rs);
@@ -3357,14 +3387,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
*/
void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
- struct rcu_node *rnp = rcu_get_root();
-
/*
* Any prior manipulation of RCU-protected data must happen
* before the loads from ->gp_seq and ->expedited_sequence.
*/
smp_mb(); /* ^^^ */
- rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+
+ // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use
+ // in poll_state_synchronize_rcu_full() notwithstanding. Use of
+ // the latter here would result in too-short grace periods due to
+ // interactions with newly onlined CPUs.
+ rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq);
rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 77efed89c79e..8d4895c854c5 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -230,17 +230,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
* specified leaf rcu_node structure, which is acquired by the caller.
*/
static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags,
- unsigned long mask, bool wake)
+ unsigned long mask_in, bool wake)
__releases(rnp->lock)
{
int cpu;
+ unsigned long mask;
struct rcu_data *rdp;
raw_lockdep_assert_held_rcu_node(rnp);
- if (!(rnp->expmask & mask)) {
+ if (!(rnp->expmask & mask_in)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
+ mask = mask_in & rnp->expmask;
WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 2605dd234a13..5ff3bc56ff51 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1557,8 +1557,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
/* Dump out nocb kthread state for the specified rcu_data structure. */
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
- char bufw[20];
- char bufr[20];
+ char bufd[22];
+ char bufw[45];
+ char bufr[45];
+ char bufn[22];
+ char bufb[22];
struct rcu_data *nocb_next_rdp;
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
@@ -1572,9 +1575,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
typeof(*rdp),
nocb_entry_rdp);
- sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
- sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]);
+ sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
+ rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]);
+ sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
nocb_next_rdp ? nocb_next_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
@@ -1586,12 +1593,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
jiffies - rdp->nocb_nobypass_last,
rdp->nocb_nobypass_count,
".D"[rcu_segcblist_ready_cbs(rsclp)],
+ rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd,
".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn,
".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb,
rcu_segcblist_n_cbs(&rdp->cblist),
rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3600152b858e..3c0bbbbb686f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -833,8 +833,17 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread)
return;
+
+ /*
+ * rcu_report_qs_rdp() can only be invoked with a stable rdp and
+ * from the local CPU.
+ *
+ * The in_atomic_preempt_off() check ensures that we come here holding
+ * the last preempt_count (which will get dropped once we return to
+ * __rcu_read_unlock()).
+ */
rdp = this_cpu_ptr(&rcu_data);
rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
@@ -975,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
*/
static void rcu_flavor_sched_clock_irq(int user)
{
- if (user || rcu_is_cpu_rrupt_from_idle()) {
+ if (user || rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) &&
+ (preempt_count() == HARDIRQ_OFFSET))) {
/*
* Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
+ * mode, from the idle loop without this being a nested
+ * interrupt, or while not holding the task preempt count
+ * (with PREEMPT_COUNT=y). In this case, the CPU is in a
+ * quiescent state, so note it.
*
* No memory barrier is required here because rcu_qs()
* references only CPU-local variables that other CPUs
diff --git a/kernel/reboot.c b/kernel/reboot.c
index b5a8569e5d81..41ab9e1ba357 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -704,6 +704,7 @@ void kernel_power_off(void)
migrate_to_reboot_cpu();
syscore_shutdown();
pr_emerg("Power down\n");
+ pr_flush(1000, true);
kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_power_off();
}
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2cb16091ec0a..b7a1ec327e81 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -78,24 +78,24 @@ efault:
return -EFAULT;
}
-static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
- u32 node_id, u32 mm_cid)
-{
- rseq_kernel_fields(t)->cpu_id_start = cpu_id;
- rseq_kernel_fields(t)->cpu_id = cpu_id;
- rseq_kernel_fields(t)->node_id = node_id;
- rseq_kernel_fields(t)->mm_cid = mm_cid;
-}
+/*
+ * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
+ * state.
+ */
+#define rseq_unsafe_put_user(t, value, field, error_label) \
+ do { \
+ unsafe_put_user(value, &t->rseq->field, error_label); \
+ rseq_kernel_fields(t)->field = value; \
+ } while (0)
+
#else
static int rseq_validate_ro_fields(struct task_struct *t)
{
return 0;
}
-static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
- u32 node_id, u32 mm_cid)
-{
-}
+#define rseq_unsafe_put_user(t, value, field, error_label) \
+ unsafe_put_user(value, &t->rseq->field, error_label)
#endif
/*
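
A userspace model of the lock-step idea behind rseq_unsafe_put_user(): the user-visible field and the kernel's validation copy are always written together. Names below are illustrative:

#include <stdio.h>

struct fields { unsigned int cpu_id; unsigned int node_id; };

static struct fields user_view;		/* stands in for the user-space struct rseq */
static struct fields kernel_copy;	/* stands in for rseq_kernel_fields(t) */

/* Update both views together so later validation can compare them. */
#define lockstep_put(field, value)		\
	do {					\
		user_view.field = (value);	\
		kernel_copy.field = (value);	\
	} while (0)

int main(void)
{
	lockstep_put(cpu_id, 3);
	lockstep_put(node_id, 1);
	printf("consistent: %d\n",
	       user_view.cpu_id == kernel_copy.cpu_id &&
	       user_view.node_id == kernel_copy.node_id);
	return 0;
}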
@@ -173,17 +173,18 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
WARN_ON_ONCE((int) mm_cid < 0);
if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
- unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
- unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
- unsafe_put_user(node_id, &rseq->node_id, efault_end);
- unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end);
+
+ rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
+ rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
+ rseq_unsafe_put_user(t, node_id, node_id, efault_end);
+ rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
+
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally updated only if
* t->rseq_len != ORIG_RSEQ_SIZE.
*/
user_write_access_end();
- rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid);
trace_rseq_update(t);
return 0;
@@ -195,6 +196,7 @@ efault:
static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
+ struct rseq __user *rseq = t->rseq;
u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
mm_cid = 0;
@@ -202,40 +204,61 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
* Validate read-only rseq fields.
*/
if (rseq_validate_ro_fields(t))
- return -EFAULT;
- /*
- * Reset cpu_id_start to its initial state (0).
- */
- if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
- return -EFAULT;
- /*
- * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
- * in after unregistration can figure out that rseq needs to be
- * registered again.
- */
- if (put_user(cpu_id, &t->rseq->cpu_id))
- return -EFAULT;
- /*
- * Reset node_id to its initial state (0).
- */
- if (put_user(node_id, &t->rseq->node_id))
- return -EFAULT;
+ goto efault;
+
+ if (!user_write_access_begin(rseq, t->rseq_len))
+ goto efault;
+
/*
- * Reset mm_cid to its initial state (0).
+ * Reset all fields to their initial state.
+ *
+ * All fields have an initial state of 0 except cpu_id which is set to
+ * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
+ * unregistration can figure out that rseq needs to be registered
+ * again.
*/
- if (put_user(mm_cid, &t->rseq->mm_cid))
- return -EFAULT;
-
- rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid);
+ rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
+ rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
+ rseq_unsafe_put_user(t, node_id, node_id, efault_end);
+ rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally reset only if
* t->rseq_len != ORIG_RSEQ_SIZE.
*/
+ user_write_access_end();
+ return 0;
+
+efault_end:
+ user_write_access_end();
+efault:
+ return -EFAULT;
+}
+
+/*
+ * Get the user-space pointer value stored in the 'rseq_cs' field.
+ */
+static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
+{
+ if (!rseq_cs)
+ return -EFAULT;
+
+#ifdef CONFIG_64BIT
+ if (get_user(*rseq_cs, &rseq->rseq_cs))
+ return -EFAULT;
+#else
+ if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
+ return -EFAULT;
+#endif
+
return 0;
}
+/*
+ * If the rseq_cs field of 'struct rseq' contains a valid pointer to
+ * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
+ */
static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
{
struct rseq_cs __user *urseq_cs;
@@ -244,17 +267,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
u32 sig;
int ret;
-#ifdef CONFIG_64BIT
- if (get_user(ptr, &t->rseq->rseq_cs))
- return -EFAULT;
-#else
- if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
- return -EFAULT;
-#endif
+ ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr);
+ if (ret)
+ return ret;
+
+ /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
if (!ptr) {
memset(rseq_cs, 0, sizeof(*rseq_cs));
return 0;
}
+ /* Check that the pointer value fits in the user-space process space. */
if (ptr >= TASK_SIZE)
return -EINVAL;
urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
@@ -330,7 +352,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
return !!event_mask;
}
-static int clear_rseq_cs(struct task_struct *t)
+static int clear_rseq_cs(struct rseq __user *rseq)
{
/*
* The rseq_cs field is set to NULL on preemption or signal
@@ -341,9 +363,9 @@ static int clear_rseq_cs(struct task_struct *t)
* Set rseq_cs to NULL.
*/
#ifdef CONFIG_64BIT
- return put_user(0UL, &t->rseq->rseq_cs);
+ return put_user(0UL, &rseq->rseq_cs);
#else
- if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
+ if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
return -EFAULT;
return 0;
#endif
@@ -375,11 +397,11 @@ static int rseq_ip_fixup(struct pt_regs *regs)
* Clear the rseq_cs pointer and return.
*/
if (!in_rseq_cs(ip, &rseq_cs))
- return clear_rseq_cs(t);
+ return clear_rseq_cs(t->rseq);
ret = rseq_need_restart(t, rseq_cs.flags);
if (ret <= 0)
return ret;
- ret = clear_rseq_cs(t);
+ ret = clear_rseq_cs(t->rseq);
if (ret)
return ret;
trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
@@ -453,6 +475,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
int, flags, u32, sig)
{
int ret;
+ u64 rseq_cs;
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
@@ -507,6 +530,19 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
+
+ /*
+ * If the rseq_cs pointer is non-NULL on registration, clear it to
+ * avoid a potential segfault on return to user-space. The proper thing
+ * to do would have been to fail the registration but this would break
+ * older libcs that reuse the rseq area for new threads without
+ * clearing the fields.
+ */
+ if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs))
+ return -EFAULT;
+ if (rseq_cs && clear_rseq_cs(rseq))
+ return -EFAULT;
+
#ifdef CONFIG_DEBUG_RSEQ
/*
* Initialize the in-kernel rseq fields copy for validation of
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 976092b7bd45..8ae86371ddcd 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -22,6 +22,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_build_policy.o += -DDISABLE_BRANCH_PROFILING
+CFLAGS_build_utility.o += -DDISABLE_BRANCH_PROFILING
+endif
#
# Build efficiency:
#
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index fae1f5c921eb..72d97aa8b726 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -61,6 +61,7 @@
#ifdef CONFIG_SCHED_CLASS_EXT
# include "ext.c"
+# include "ext_idle.c"
#endif
#include "syscalls.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 80a3df49ab47..bf9d8db94b70 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -68,9 +68,7 @@
# include "cpufreq_schedutil.c"
#endif
-#ifdef CONFIG_SCHED_DEBUG
-# include "debug.c"
-#endif
+#include "debug.c"
#ifdef CONFIG_SCHEDSTATS
# include "stats.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67189907214d..cfaca3040b2f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -91,7 +91,6 @@
#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
-#include "stats.h"
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
@@ -119,7 +118,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#ifdef CONFIG_SCHED_DEBUG
/*
* Debugging: various feature bits
*
@@ -129,7 +127,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
-const_debug unsigned int sysctl_sched_features =
+__read_mostly unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
@@ -143,13 +141,12 @@ const_debug unsigned int sysctl_sched_features =
*/
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
-#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
+__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
@@ -491,6 +488,16 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
#endif /* CONFIG_SCHED_CORE */
+/* need a wrapper since we may need to trace from modules */
+EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
+
+/* Call via the helper macro trace_set_current_state. */
+void __trace_set_current_state(int state_value)
+{
+ trace_sched_set_state_tp(current, state_value);
+}
+EXPORT_SYMBOL(__trace_set_current_state);
+
/*
* Serialization rules:
*
@@ -800,11 +807,10 @@ void update_rq_clock(struct rq *rq)
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
-#ifdef CONFIG_SCHED_DEBUG
if (sched_feat(WARN_DOUBLE_CLOCK))
- SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
+ WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
-#endif
+
clock = sched_clock_cpu(cpu_of(rq));
scx_rq_clock_update(rq, clock);
@@ -916,8 +922,7 @@ static void hrtick_rq_init(struct rq *rq)
#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- rq->hrtick_timer.function = hrtick;
+ hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
@@ -1720,7 +1725,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
bucket = &uc_rq->bucket[uc_se->bucket_id];
- SCHED_WARN_ON(!bucket->tasks);
+ WARN_ON_ONCE(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
@@ -1740,7 +1745,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fix up the expected value.
*/
- SCHED_WARN_ON(bucket->value > rq_clamp);
+ WARN_ON_ONCE(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
uclamp_rq_set(rq, clamp_id, bkt_clamp);
@@ -1757,7 +1762,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
- if (!static_branch_unlikely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return;
if (unlikely(!p->sched_class->uclamp_enabled))
@@ -1784,7 +1789,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
- if (!static_branch_unlikely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return;
if (unlikely(!p->sched_class->uclamp_enabled))
@@ -1942,12 +1947,12 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write,
}
if (update_root_tg) {
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
uclamp_update_root_tg();
}
if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
uclamp_sync_util_min_rt_default();
}
@@ -2122,7 +2127,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
+ WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
ASSERT_EXCLUSIVE_WRITER(p->on_rq);
@@ -2727,7 +2732,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
* XXX do further audits, this smells like something putrid.
*/
if (ctx->flags & SCA_MIGRATE_DISABLE)
- SCHED_WARN_ON(!p->on_cpu);
+ WARN_ON_ONCE(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
@@ -3292,7 +3297,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
-#ifdef CONFIG_SCHED_DEBUG
unsigned int state = READ_ONCE(p->__state);
/*
@@ -3330,7 +3334,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(!cpu_online(new_cpu));
WARN_ON_ONCE(is_migration_disabled(p));
-#endif
trace_sched_migrate_task(p, new_cpu);
@@ -3922,13 +3925,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
- /*
- * The BPF scheduler may depend on select_task_rq() being invoked during
- * wakeups. In addition, @p may end up executing on a different CPU
- * regardless of what happens in the wakeup path making the ttwu_queue
- * optimization less meaningful. Skip if on SCX.
- */
- if (task_on_scx(p))
+ /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
+ if (!scx_allow_ttwu_queue(p))
return false;
/*
@@ -4196,7 +4194,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
if (!ttwu_state_match(p, state, &success))
goto out;
@@ -4490,7 +4488,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_LIST_HEAD(&p->se.group_node);
/* A delayed task cannot be in clone(). */
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -5307,6 +5305,12 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
finish_task_switch(prev);
+ /*
+ * This is a special case: the newly created task has just
+ * completed its first context switch and is returning from
+ * schedule() for the first time along this path.
+ */
+ trace_sched_exit_tp(true, CALLER_ADDR0);
preempt_enable();
if (current->set_child_tid)
@@ -5578,7 +5582,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-#ifdef CONFIG_SCHED_DEBUG
static u64 cpu_resched_latency(struct rq *rq)
{
int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
@@ -5623,9 +5626,6 @@ static int __init setup_resched_latency_warn_ms(char *str)
return 1;
}
__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
-#else
-static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
-#endif /* CONFIG_SCHED_DEBUG */
/*
* This function gets called by the timer code, with HZ frequency.
@@ -5746,7 +5746,7 @@ static void sched_tick_remote(struct work_struct *work)
* we are always sure that there is no proxy (only a
* single task is running).
*/
- SCHED_WARN_ON(rq->curr != rq->donor);
+ WARN_ON_ONCE(rq->curr != rq->donor);
update_rq_clock(rq);
if (!is_idle_task(curr)) {
@@ -5966,7 +5966,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
- SCHED_WARN_ON(ct_state() == CT_STATE_USER);
+ WARN_ON_ONCE(ct_state() == CT_STATE_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -6650,12 +6650,15 @@ static void __sched notrace __schedule(int sched_mode)
* as a preemption by schedule_debug() and RCU.
*/
bool preempt = sched_mode > SM_NONE;
+ bool is_switch = false;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
struct rq *rq;
int cpu;
+ trace_sched_entry_tp(preempt, CALLER_ADDR0);
+
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
@@ -6719,11 +6722,10 @@ static void __sched notrace __schedule(int sched_mode)
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
-#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
-#endif
- if (likely(prev != next)) {
+ is_switch = prev != next;
+ if (likely(is_switch)) {
rq->nr_switches++;
/*
* RCU users of rcu_dereference(rq->curr) may not see
@@ -6768,6 +6770,7 @@ picked:
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
}
+ trace_sched_exit_tp(is_switch, CALLER_ADDR0);
}
void __noreturn do_task_dead(void)
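The new sched entry/exit tracepoints bracket every invocation of __schedule(), plus the schedule_tail() path further up. Their assumed declarations, mirroring the arguments passed at the call sites (a sketch of what include/trace/events/sched.h would carry, declared as bare tracepoints for probes/BPF):

DECLARE_TRACE(sched_entry_tp,
              TP_PROTO(bool preempt, unsigned long ip),
              TP_ARGS(preempt, ip));

DECLARE_TRACE(sched_exit_tp,
              TP_PROTO(bool is_switch, unsigned long ip),
              TP_ARGS(is_switch, ip));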
@@ -6812,7 +6815,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
* deadlock if the callback attempts to acquire a lock which is
* already acquired.
*/
- SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
+ WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
@@ -7095,7 +7098,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
+ WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
@@ -7290,7 +7293,7 @@ int __sched __cond_resched(void)
return 1;
}
/*
- * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick
* whether the current CPU is in an RCU read-side critical section,
* so the tick can report quiescent states even for CPUs looping
* in kernel context. In contrast, in non-preemptible kernels,
@@ -7299,6 +7302,8 @@ int __sched __cond_resched(void)
* RCU quiescent state. Therefore, the following code causes
* cond_resched() to report a quiescent state, but only when RCU
* is in urgent need of one.
+ * A third case, preemptible but non-PREEMPT_RCU kernels, provides
+ * urgently needed quiescent states via rcu_flavor_sched_clock_irq().
*/
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
@@ -7647,10 +7652,57 @@ PREEMPT_MODEL_ACCESSOR(lazy);
#else /* !CONFIG_PREEMPT_DYNAMIC: */
+#define preempt_dynamic_mode -1
+
static inline void preempt_dynamic_init(void) { }
#endif /* CONFIG_PREEMPT_DYNAMIC */
+const char *preempt_modes[] = {
+ "none", "voluntary", "full", "lazy", NULL,
+};
+
+const char *preempt_model_str(void)
+{
+ bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) ||
+ IS_ENABLED(CONFIG_PREEMPT_LAZY));
+ static char buf[128];
+
+ if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) {
+ struct seq_buf s;
+
+ seq_buf_init(&s, buf, sizeof(buf));
+ seq_buf_puts(&s, "PREEMPT");
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ seq_buf_printf(&s, "%sRT%s",
+ brace ? "_{" : "_",
+ brace ? "," : "");
+
+ if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
+ seq_buf_printf(&s, "(%s)%s",
+ preempt_dynamic_mode > 0 ?
+ preempt_modes[preempt_dynamic_mode] : "undef",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ seq_buf_printf(&s, "LAZY%s",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
+
+ return seq_buf_str(&s);
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD))
+ return "VOLUNTARY";
+
+ return "NONE";
+}
+
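preempt_model_str() above composes strings such as "PREEMPT", "PREEMPT_RT", "PREEMPT(full)" or "PREEMPT_{RT,(full)}". A hedged sketch of a caller; the concrete call sites converted by the series are not shown in this diff and the function name below is hypothetical:

static void report_preempt_model_sketch(void)
{
        /* e.g. prints "PREEMPT_{RT,(full)}" on a dynamic PREEMPT_RT build */
        pr_info("Preemption model: %s\n", preempt_model_str());
}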
int io_schedule_prepare(void)
{
int old_iowait = current->in_iowait;
@@ -7765,10 +7817,9 @@ void show_state_filter(unsigned int state_filter)
sched_show_task(p);
}
-#ifdef CONFIG_SCHED_DEBUG
if (!state_filter)
sysrq_sched_debug_show();
-#endif
+
rcu_read_unlock();
/*
* Only show locks if all tasks are dumped:
@@ -8183,7 +8234,7 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- partition_sched_domains(1, NULL, NULL);
+ cpuset_reset_sched_domains();
if (--num_cpus_frozen)
return;
/*
@@ -8202,7 +8253,7 @@ static void cpuset_cpu_inactive(unsigned int cpu)
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
- partition_sched_domains(1, NULL, NULL);
+ cpuset_reset_sched_domains();
}
}
@@ -8424,9 +8475,9 @@ void __init sched_init_smp(void)
* CPU masks are stable and all blatant races in the below code cannot
* happen.
*/
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
sched_init_domains(cpu_active_mask);
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
@@ -9016,7 +9067,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static struct task_group *sched_get_task_group(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -9028,13 +9079,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
-
- return tg;
-}
-
-static void sched_change_group(struct task_struct *tsk, struct task_group *group)
-{
- tsk->sched_task_group = group;
+ tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
@@ -9055,20 +9100,11 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct task_group *group;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- /*
- * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
- * group changes.
- */
- group = sched_get_task_group(tsk);
- if (group == tsk->sched_task_group)
- return;
-
update_rq_clock(rq);
running = task_current_donor(rq, tsk);
@@ -9079,7 +9115,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, group);
+ sched_change_group(tsk);
if (!for_autogroup)
scx_cgroup_move_task(tsk);
@@ -9203,7 +9239,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
unsigned int clamps;
lockdep_assert_held(&uclamp_mutex);
- SCHED_WARN_ON(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held());
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
@@ -9295,7 +9331,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
if (req.ret)
return req.ret;
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
guard(mutex)(&uclamp_mutex);
guard(rcu)();
@@ -10538,7 +10574,7 @@ static void task_mm_cid_work(struct callback_head *work)
struct mm_struct *mm;
int weight, cpu;
- SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
+ WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
work->next = work; /* Prevent double-add */
if (t->flags & PF_EXITING)
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 1ef98a93eb1d..c4606ca89210 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -65,7 +65,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* a cookie until after we've removed it, we must have core scheduling
* enabled here.
*/
- SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
+ WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq));
if (sched_core_enqueued(p))
sched_core_dequeue(rq, p, DEQUEUE_SAVE);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ff4df16b5186..ad45a8fea245 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -166,14 +166,14 @@ static inline unsigned long dl_bw_capacity(int i)
}
}
-static inline bool dl_bw_visited(int cpu, u64 gen)
+bool dl_bw_visited(int cpu, u64 cookie)
{
struct root_domain *rd = cpu_rq(cpu)->rd;
- if (rd->visit_gen == gen)
+ if (rd->visit_cookie == cookie)
return true;
- rd->visit_gen = gen;
+ rd->visit_cookie = cookie;
return false;
}
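dl_bw_visited() implements a visit cookie: each traversal bumps a shared cookie and then touches every root domain at most once, no matter how many CPUs share it. A minimal sketch of the pattern, modeled on sched_dl_do_global(); the function name is illustrative:

static void visit_root_domains_once_sketch(void)
{
        u64 cookie = ++dl_cookie;       /* serialized by the callers' mutexes */
        int cpu;

        for_each_possible_cpu(cpu) {
                rcu_read_lock_sched();
                if (dl_bw_visited(cpu, cookie)) {
                        rcu_read_unlock_sched();
                        continue;       /* this root domain was already handled */
                }
                /* ... operate on the root domain of @cpu exactly once ... */
                rcu_read_unlock_sched();
        }
}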
@@ -207,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i)
return SCHED_CAPACITY_SCALE;
}
-static inline bool dl_bw_visited(int cpu, u64 gen)
+bool dl_bw_visited(int cpu, u64 cookie)
{
return false;
}
@@ -249,8 +249,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw += dl_bw;
- SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
- SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */
+ WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw);
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
@@ -262,7 +262,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw -= dl_bw;
- SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
+ WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
@@ -276,7 +276,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw += dl_bw;
- SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
+ WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */
}
static inline
@@ -286,10 +286,10 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw -= dl_bw;
- SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
+ WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */
if (dl_rq->this_bw > old)
dl_rq->this_bw = 0;
- SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw);
}
static inline
@@ -1382,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- timer->function = dl_task_timer;
+ hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
/*
@@ -1839,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- timer->function = inactive_task_timer;
+ hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
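Both timer-init hunks follow the same tree-wide conversion: hrtimer_setup() folds hrtimer_init() plus the open-coded ->function assignment into a single call. The generic pattern, as a sketch with a hypothetical callback:

static enum hrtimer_restart example_timer_fn(struct hrtimer *timer)
{
        return HRTIMER_NORESTART;       /* placeholder body for the sketch */
}

static void init_example_timer(struct hrtimer *timer)
{
        hrtimer_setup(timer, example_timer_fn, CLOCK_MONOTONIC,
                      HRTIMER_MODE_REL_HARD);
}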
#define __node_2_dle(node) \
@@ -2956,7 +2954,7 @@ void dl_add_task_root_domain(struct task_struct *p)
struct dl_bw *dl_b;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- if (!dl_task(p)) {
+ if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return;
}
@@ -2981,18 +2979,22 @@ void dl_clear_root_domain(struct root_domain *rd)
rd->dl_bw.total_bw = 0;
/*
- * dl_server bandwidth is only restored when CPUs are attached to root
- * domains (after domains are created or CPUs moved back to the
- * default root doamin).
+ * dl_servers are not tasks. Since dl_add_task_root_domain ignores
+ * them, we need to account for them here explicitly.
*/
for_each_cpu(i, rd->span) {
struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
if (dl_server(dl_se) && cpu_active(i))
- rd->dl_bw.total_bw += dl_se->dl_bw;
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
}
}
+void dl_clear_root_domain_cpu(int cpu)
+{
+ dl_clear_root_domain(cpu_rq(cpu)->rd);
+}
+
#endif /* CONFIG_SMP */
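dl_clear_root_domain() now accounts dl_server bandwidth through __dl_add() instead of a bare total_bw addition. An assumed shape of that helper (kernel/sched/sched.h), shown only to make the difference visible: besides bumping total_bw, it also redistributes the per-CPU extra bandwidth.

static inline void __dl_add_sketch(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
        dl_b->total_bw += tsk_bw;
        /* spread the bandwidth change across the CPUs of the root domain */
        __dl_update(dl_b, -((s32)tsk_bw / cpus));
}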
static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@ -3171,15 +3173,18 @@ DEFINE_SCHED_CLASS(dl) = {
#endif
};
-/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
-static u64 dl_generation;
+/*
+ * Used for dl_bw check and update, used under sched_rt_handler()::mutex and
+ * sched_domains_mutex.
+ */
+u64 dl_cookie;
int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
- u64 gen = ++dl_generation;
+ u64 cookie = ++dl_cookie;
struct dl_bw *dl_b;
int cpu, cpus, ret = 0;
unsigned long flags;
@@ -3192,7 +3197,7 @@ int sched_dl_global_validate(void)
for_each_online_cpu(cpu) {
rcu_read_lock_sched();
- if (dl_bw_visited(cpu, gen))
+ if (dl_bw_visited(cpu, cookie))
goto next;
dl_b = dl_bw_of(cpu);
@@ -3229,7 +3234,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
void sched_dl_do_global(void)
{
u64 new_bw = -1;
- u64 gen = ++dl_generation;
+ u64 cookie = ++dl_cookie;
struct dl_bw *dl_b;
int cpu;
unsigned long flags;
@@ -3240,7 +3245,7 @@ void sched_dl_do_global(void)
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
- if (dl_bw_visited(cpu, gen)) {
+ if (dl_bw_visited(cpu, cookie)) {
rcu_read_unlock_sched();
continue;
}
@@ -3567,9 +3572,7 @@ void dl_bw_free(int cpu, u64 dl_bw)
}
#endif
-#ifdef CONFIG_SCHED_DEBUG
void print_dl_stats(struct seq_file *m, int cpu)
{
print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
}
-#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ef047add7f9e..56ae54e0ce6a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -244,11 +244,13 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
- static const char * preempt_modes[] = {
- "none", "voluntary", "full", "lazy",
- };
- int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
+ int j;
+
+ /* Count entries in the NULL-terminated preempt_modes[] array */
+ for (j = 0; preempt_modes[j]; j++)
+ ;
+ j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
for (; i < j; i++) {
if (preempt_dynamic_mode == i)
@@ -292,7 +294,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
bool orig;
cpus_read_lock();
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
orig = sched_debug_verbose;
result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
@@ -304,7 +306,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
sd_dentry = NULL;
}
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
cpus_read_unlock();
return result;
@@ -515,9 +517,9 @@ static __init int sched_init_debug(void)
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
update_sched_domain_debugfs();
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
#endif
#ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7b9dfee858e7..21575d39c376 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6,6 +6,9 @@
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
+#include <linux/btf_ids.h>
+#include "ext_idle.h"
+
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
enum scx_consts {
@@ -93,7 +96,7 @@ enum scx_ops_flags {
/*
* Keep built-in idle tracking even if ops.update_idle() is implemented.
*/
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
/*
* By default, if there are no other task to run on the CPU, ext core
@@ -101,7 +104,7 @@ enum scx_ops_flags {
* flag is specified, such tasks are passed to ops.enqueue() with
* %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
*/
- SCX_OPS_ENQ_LAST = 1LLU << 1,
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
/*
* An exiting task may schedule after PF_EXITING is set. In such cases,
@@ -114,13 +117,13 @@ enum scx_ops_flags {
* depend on pid lookups and wants to handle these tasks directly, the
* following flag can be used.
*/
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
/*
* If set, only tasks with policy set to SCHED_EXT are attached to
* sched_ext. If clear, SCHED_NORMAL tasks are also included.
*/
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
/*
* A migration disabled task can only execute on its current CPU. By
@@ -133,7 +136,29 @@ enum scx_ops_flags {
* current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
* and thus may disagree with cpumask_weight(p->cpus_ptr).
*/
- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
/*
* CPU cgroup support flags
@@ -144,7 +169,9 @@ enum scx_ops_flags {
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
SCX_OPS_HAS_CGROUP_WEIGHT,
};
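A hedged sketch of how a BPF scheduler would opt in to the new flags, using the tools/sched_ext BPF conventions (BPF_STRUCT_OPS, SEC(".struct_ops.link"), scx_bpf_dsq_insert()); the "example" scheduler and its callbacks are hypothetical, only the flag names come from the enum above:

s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
                   s32 prev_cpu, u64 wake_flags)
{
        bool is_idle;

        return scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
}

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
        scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops example_ops = {
        .select_cpu     = (void *)example_select_cpu,
        .enqueue        = (void *)example_enqueue,
        .flags          = SCX_OPS_ALLOW_QUEUED_WAKEUP |
                          SCX_OPS_BUILTIN_IDLE_PER_NODE,
        .name           = "example",
};

With SCX_OPS_ALLOW_QUEUED_WAKEUP set, such a scheduler must tolerate example_enqueue() being invoked on the wakee's CPU without a preceding example_select_cpu() call, as the flag's comment above spells out.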
@@ -779,6 +806,7 @@ enum scx_deq_flags {
enum scx_pick_idle_cpu_flags {
SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
};
enum scx_kick_flags {
@@ -894,16 +922,11 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
static struct sched_ext_ops scx_ops;
static bool scx_warned_zero_slice;
+DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
-static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
-
-#ifdef CONFIG_SMP
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
-#endif
static struct static_key_false scx_has_op[SCX_OPI_END] =
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
@@ -938,21 +961,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
static struct delayed_work scx_watchdog_work;
-/* idle tracking */
-#ifdef CONFIG_SMP
-#ifdef CONFIG_CPUMASK_OFFSTACK
-#define CL_ALIGNED_IF_ONSTACK
-#else
-#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
-#endif
-
-static struct {
- cpumask_var_t cpu;
- cpumask_var_t smt;
-} idle_masks CL_ALIGNED_IF_ONSTACK;
-
-#endif /* CONFIG_SMP */
-
/* for %SCX_KICK_WAIT */
static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
@@ -1473,6 +1481,117 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * The total number of tasks enqueued (or pick_task-ed) with a
+ * default time slice (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_ENQ_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+/*
+ * The event counters are kept in a per-CPU variable to minimize the
+ * accounting overhead and avoid synchronization. A system-wide view of
+ * the event counters is constructed on demand by scx_bpf_events().
+ */
+static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of times the event occurred
+ *
+ * This can be used when preemption is not disabled.
+ */
+#define scx_add_event(name, cnt) do { \
+ this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_event(#name, cnt); \
+} while(0)
+
+/**
+ * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of times the event occurred
+ *
+ * This should be used only when preemption is disabled.
+ */
+#define __scx_add_event(name, cnt) do { \
+ __this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_event(#name, cnt); \
+} while(0)
+
+/**
+ * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
+ * @dst_e: destination event stats
+ * @src_e: source event stats
+ * @kind: a kind of event to be aggregated
+ */
+#define scx_agg_event(dst_e, src_e, kind) do { \
+ (dst_e)->kind += READ_ONCE((src_e)->kind); \
+} while(0)
+
+/**
+ * scx_dump_event - Dump an event 'kind' in 'events' to 's'
+ * @s: output seq_buf
+ * @events: event stats
+ * @kind: a kind of event to dump
+ */
+#define scx_dump_event(s, events, kind) do { \
+ dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \
+} while (0)
+
+static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+
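The counters above follow a common pattern: lock-free, CPU-local increments with aggregation deferred until a reader asks for a snapshot. A generic, self-contained sketch of that pattern; the example_ names are not part of the patch:

static DEFINE_PER_CPU(u64, example_evcnt);

static inline void example_evcnt_add(u64 cnt)
{
        /* cheap, no cross-CPU traffic on the update path */
        this_cpu_add(example_evcnt, cnt);
}

static u64 example_evcnt_read(void)
{
        u64 sum = 0;
        int cpu;

        /* tolerate slightly stale per-CPU values; no locking needed */
        for_each_possible_cpu(cpu)
                sum += READ_ONCE(*per_cpu_ptr(&example_evcnt, cpu));
        return sum;
}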
static enum scx_ops_enable_state scx_ops_enable_state(void)
{
return atomic_read(&scx_ops_enable_state_var);
@@ -2018,21 +2137,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (!scx_rq_online(rq))
goto local;
- if (scx_rq_bypassing(rq))
+ if (scx_rq_bypassing(rq)) {
+ __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
goto global;
+ }
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
goto direct;
/* see %SCX_OPS_ENQ_EXITING */
if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
- unlikely(p->flags & PF_EXITING))
+ unlikely(p->flags & PF_EXITING)) {
+ __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
goto local;
+ }
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
- is_migration_disabled(p))
+ is_migration_disabled(p)) {
+ __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
+ }
if (!SCX_HAS_OP(enqueue))
goto global;
@@ -2072,6 +2197,7 @@ local:
*/
touch_core_sched(rq, p);
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
local_norefill:
dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
return;
@@ -2079,6 +2205,7 @@ local_norefill:
global:
touch_core_sched(rq, p); /* see the comment in local: */
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
dispatch_enqueue(find_global_dsq(p), p, enq_flags);
}
@@ -2150,6 +2277,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
+
+ if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
+ unlikely(cpu_of(rq) != p->scx.selected_cpu))
+ __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
}
static void ops_dequeue(struct task_struct *p, u64 deq_flags)
@@ -2337,11 +2468,11 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
* The caller must ensure that @p and @rq are on different CPUs.
*/
static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
- bool trigger_error)
+ bool enforce)
{
int cpu = cpu_of(rq);
- SCHED_WARN_ON(task_cpu(p) == cpu);
+ WARN_ON_ONCE(task_cpu(p) == cpu);
/*
* If @p has migration disabled, @p->cpus_ptr is updated to contain only
@@ -2356,7 +2487,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
* easily be masked if task_allowed_on_cpu() is done first.
*/
if (unlikely(is_migration_disabled(p))) {
- if (trigger_error)
+ if (enforce)
scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
p->comm, p->pid, task_cpu(p), cpu);
return false;
@@ -2369,14 +2500,17 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
* picked CPU is outside the allowed mask.
*/
if (!task_allowed_on_cpu(p, cpu)) {
- if (trigger_error)
+ if (enforce)
scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
cpu, p->comm, p->pid);
return false;
}
- if (!scx_rq_online(rq))
+ if (!scx_rq_online(rq)) {
+ if (enforce)
+ __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
+ }
return true;
}
@@ -2446,7 +2580,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
}
#else /* CONFIG_SMP */
static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
+static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; }
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
@@ -2893,6 +3027,7 @@ no_tasks:
if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3159,8 +3294,10 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*/
if (keep_prev) {
p = prev;
- if (!p->scx.slice)
+ if (!p->scx.slice) {
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ }
} else {
p = first_local_task(rq);
if (!p) {
@@ -3176,6 +3313,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
scx_warned_zero_slice = true;
}
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
}
}
@@ -3220,418 +3358,10 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
#ifdef CONFIG_SMP
-static bool test_and_clear_cpu_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- /*
- * SMT mask should be cleared whether we can claim @cpu or not. The SMT
- * cluster is not wholly idle either way. This also prevents
- * scx_pick_idle_cpu() from getting caught in an infinite loop.
- */
- if (sched_smt_active()) {
- const struct cpumask *smt = cpu_smt_mask(cpu);
-
- /*
- * If offline, @cpu is not its own sibling and
- * scx_pick_idle_cpu() can get caught in an infinite loop as
- * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
- * is eventually cleared.
- *
- * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
- * reduce memory writes, which may help alleviate cache
- * coherence pressure.
- */
- if (cpumask_intersects(smt, idle_masks.smt))
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
- else if (cpumask_test_cpu(cpu, idle_masks.smt))
- __cpumask_clear_cpu(cpu, idle_masks.smt);
- }
-#endif
- return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
-}
-
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
-{
- int cpu;
-
-retry:
- if (sched_smt_active()) {
- cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
- if (cpu < nr_cpu_ids)
- goto found;
-
- if (flags & SCX_PICK_IDLE_CORE)
- return -EBUSY;
- }
-
- cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
- if (cpu >= nr_cpu_ids)
- return -EBUSY;
-
-found:
- if (test_and_clear_cpu_idle(cpu))
- return cpu;
- else
- goto retry;
-}
-
-/*
- * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
- * domain is not defined).
- */
-static unsigned int llc_weight(s32 cpu)
-{
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd)
- return 0;
-
- return sd->span_weight;
-}
-
-/*
- * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
- * domain is not defined).
- */
-static struct cpumask *llc_span(s32 cpu)
-{
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd)
- return 0;
-
- return sched_domain_span(sd);
-}
-
-/*
- * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
- * NUMA domain is not defined).
- */
-static unsigned int numa_weight(s32 cpu)
-{
- struct sched_domain *sd;
- struct sched_group *sg;
-
- sd = rcu_dereference(per_cpu(sd_numa, cpu));
- if (!sd)
- return 0;
- sg = sd->groups;
- if (!sg)
- return 0;
-
- return sg->group_weight;
-}
-
-/*
- * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
- * domain is not defined).
- */
-static struct cpumask *numa_span(s32 cpu)
-{
- struct sched_domain *sd;
- struct sched_group *sg;
-
- sd = rcu_dereference(per_cpu(sd_numa, cpu));
- if (!sd)
- return NULL;
- sg = sd->groups;
- if (!sg)
- return NULL;
-
- return sched_group_span(sg);
-}
-
-/*
- * Return true if the LLC domains do not perfectly overlap with the NUMA
- * domains, false otherwise.
- */
-static bool llc_numa_mismatch(void)
-{
- int cpu;
-
- /*
- * We need to scan all online CPUs to verify whether their scheduling
- * domains overlap.
- *
- * While it is rare to encounter architectures with asymmetric NUMA
- * topologies, CPU hotplugging or virtualized environments can result
- * in asymmetric configurations.
- *
- * For example:
- *
- * NUMA 0:
- * - LLC 0: cpu0..cpu7
- * - LLC 1: cpu8..cpu15 [offline]
- *
- * NUMA 1:
- * - LLC 0: cpu16..cpu23
- * - LLC 1: cpu24..cpu31
- *
- * In this case, if we only check the first online CPU (cpu0), we might
- * incorrectly assume that the LLC and NUMA domains are fully
- * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
- * domains).
- */
- for_each_online_cpu(cpu)
- if (llc_weight(cpu) != numa_weight(cpu))
- return true;
-
- return false;
-}
-
-/*
- * Initialize topology-aware scheduling.
- *
- * Detect if the system has multiple LLC or multiple NUMA domains and enable
- * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
- * selection policy.
- *
- * Assumption: the kernel's internal topology representation assumes that each
- * CPU belongs to a single LLC domain, and that each LLC domain is entirely
- * contained within a single NUMA node.
- */
-static void update_selcpu_topology(void)
-{
- bool enable_llc = false, enable_numa = false;
- unsigned int nr_cpus;
- s32 cpu = cpumask_first(cpu_online_mask);
-
- /*
- * Enable LLC domain optimization only when there are multiple LLC
- * domains among the online CPUs. If all online CPUs are part of a
- * single LLC domain, the idle CPU selection logic can choose any
- * online CPU without bias.
- *
- * Note that it is sufficient to check the LLC domain of the first
- * online CPU to determine whether a single LLC domain includes all
- * CPUs.
- */
- rcu_read_lock();
- nr_cpus = llc_weight(cpu);
- if (nr_cpus > 0) {
- if (nr_cpus < num_online_cpus())
- enable_llc = true;
- pr_debug("sched_ext: LLC=%*pb weight=%u\n",
- cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
- }
-
- /*
- * Enable NUMA optimization only when there are multiple NUMA domains
- * among the online CPUs and the NUMA domains don't perfectly overlaps
- * with the LLC domains.
- *
- * If all CPUs belong to the same NUMA node and the same LLC domain,
- * enabling both NUMA and LLC optimizations is unnecessary, as checking
- * for an idle CPU in the same domain twice is redundant.
- */
- nr_cpus = numa_weight(cpu);
- if (nr_cpus > 0) {
- if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
- enable_numa = true;
- pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
- cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
- }
- rcu_read_unlock();
-
- pr_debug("sched_ext: LLC idle selection %s\n",
- str_enabled_disabled(enable_llc));
- pr_debug("sched_ext: NUMA idle selection %s\n",
- str_enabled_disabled(enable_numa));
-
- if (enable_llc)
- static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
- else
- static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
- if (enable_numa)
- static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
- else
- static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
-}
-
-/*
- * Built-in CPU idle selection policy:
- *
- * 1. Prioritize full-idle cores:
- * - always prioritize CPUs from fully idle cores (both logical CPUs are
- * idle) to avoid interference caused by SMT.
- *
- * 2. Reuse the same CPU:
- * - prefer the last used CPU to take advantage of cached data (L1, L2) and
- * branch prediction optimizations.
- *
- * 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
- *
- * 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
- *
- * 5. Pick any idle CPU usable by the task.
- *
- * Step 3 and 4 are performed only if the system has, respectively, multiple
- * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
- * scx_selcpu_topo_numa).
- *
- * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
- * we never call ops.select_cpu() for them, see select_task_rq().
- */
-static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
- u64 wake_flags, bool *found)
-{
- const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
- s32 cpu;
-
- *found = false;
-
- /*
- * This is necessary to protect llc_cpus.
- */
- rcu_read_lock();
-
- /*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
- *
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
- */
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
-
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
- llc_cpus = llc_span(prev_cpu);
- }
-
- /*
- * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
- */
- if (wake_flags & SCX_WAKE_SYNC) {
- cpu = smp_processor_id();
-
- /*
- * If the waker's CPU is cache affine and prev_cpu is idle,
- * then avoid a migration.
- */
- if (cpus_share_cache(cpu, prev_cpu) &&
- test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * If the waker's local DSQ is empty, and the system is under
- * utilized, try to wake up @p to the local DSQ of the waker.
- *
- * Checking only for an empty local DSQ is insufficient as it
- * could give the wakee an unfair advantage when the system is
- * oversaturated.
- *
- * Checking only for the presence of idle CPUs is also
- * insufficient as the local DSQ of the waker could have tasks
- * piled up on it even if there is an idle core elsewhere on
- * the system.
- */
- if (!cpumask_empty(idle_masks.cpu) &&
- !(current->flags & PF_EXITING) &&
- cpu_rq(cpu)->scx.local_dsq.nr == 0) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
- goto cpu_found;
- }
- }
-
- /*
- * If CPU has SMT, any wholly idle CPU is likely a better pick than
- * partially idle @prev_cpu.
- */
- if (sched_smt_active()) {
- /*
- * Keep using @prev_cpu if it's part of a fully idle core.
- */
- if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
- test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * Search for any fully idle core in the same LLC domain.
- */
- if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any fully idle core in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any full idle core usable by the task.
- */
- cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Use @prev_cpu if it's idle.
- */
- if (test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU in the same LLC domain.
- */
- if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, 0);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, 0);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU usable by the task.
- */
- cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
- if (cpu >= 0)
- goto cpu_found;
-
- rcu_read_unlock();
- return prev_cpu;
-
-cpu_found:
- rcu_read_unlock();
-
- *found = true;
- return cpu;
-}
-
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ bool rq_bypass;
+
/*
* sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
* can be a good migration opportunity with low cache and memory
@@ -3645,7 +3375,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
if (unlikely(wake_flags & WF_EXEC))
return prev_cpu;
- if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
+ rq_bypass = scx_rq_bypassing(task_rq(p));
+ if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3655,20 +3386,27 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
select_cpu, p, prev_cpu, wake_flags);
+ p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
} else {
- bool found;
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
- if (found) {
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ if (cpu >= 0) {
p->scx.slice = SCX_SLICE_DFL;
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ } else {
+ cpu = prev_cpu;
}
+ p->scx.selected_cpu = cpu;
+
+ if (rq_bypass)
+ __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
}
@@ -3696,90 +3434,6 @@ static void set_cpus_allowed_scx(struct task_struct *p,
(struct cpumask *)p->cpus_ptr);
}
-static void reset_idle_masks(void)
-{
- /*
- * Consider all online cpus idle. Should converge to the actual state
- * quickly.
- */
- cpumask_copy(idle_masks.cpu, cpu_online_mask);
- cpumask_copy(idle_masks.smt, cpu_online_mask);
-}
-
-static void update_builtin_idle(int cpu, bool idle)
-{
- assign_cpu(cpu, idle_masks.cpu, idle);
-
-#ifdef CONFIG_SCHED_SMT
- if (sched_smt_active()) {
- const struct cpumask *smt = cpu_smt_mask(cpu);
-
- if (idle) {
- /*
- * idle_masks.smt handling is racy but that's fine as
- * it's only for optimization and self-correcting.
- */
- if (!cpumask_subset(smt, idle_masks.cpu))
- return;
- cpumask_or(idle_masks.smt, idle_masks.smt, smt);
- } else {
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
- }
- }
-#endif
-}
-
-/*
- * Update the idle state of a CPU to @idle.
- *
- * If @do_notify is true, ops.update_idle() is invoked to notify the scx
- * scheduler of an actual idle state transition (idle to busy or vice
- * versa). If @do_notify is false, only the idle state in the idle masks is
- * refreshed without invoking ops.update_idle().
- *
- * This distinction is necessary, because an idle CPU can be "reserved" and
- * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
- * busy even if no tasks are dispatched. In this case, the CPU may return
- * to idle without a true state transition. Refreshing the idle masks
- * without invoking ops.update_idle() ensures accurate idle state tracking
- * while avoiding unnecessary updates and maintaining balanced state
- * transitions.
- */
-void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
-{
- int cpu = cpu_of(rq);
-
- lockdep_assert_rq_held(rq);
-
- /*
- * Trigger ops.update_idle() only when transitioning from a task to
- * the idle thread and vice versa.
- *
- * Idle transitions are indicated by do_notify being set to true,
- * managed by put_prev_task_idle()/set_next_task_idle().
- */
- if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-
- /*
- * Update the idle masks:
- * - for real idle transitions (do_notify == true)
- * - for idle-to-idle transitions (indicated by the previous task
- * being the idle thread, managed by pick_task_idle())
- *
- * Skip updating idle masks if the previous task is not the idle
- * thread, since set_next_task_idle() has already handled it when
- * transitioning from a task to the idle thread (calling this
- * function with do_notify == true).
- *
- * In this way we can avoid updating the idle masks twice,
- * unnecessarily.
- */
- if (static_branch_likely(&scx_builtin_idle_enabled))
- if (do_notify || is_idle_task(rq->curr))
- update_builtin_idle(cpu, idle);
-}
-
static void handle_hotplug(struct rq *rq, bool online)
{
int cpu = cpu_of(rq);
@@ -3787,7 +3441,7 @@ static void handle_hotplug(struct rq *rq, bool online)
atomic_long_inc(&scx_hotplug_seq);
if (scx_enabled())
- update_selcpu_topology();
+ scx_idle_update_selcpu_topology(&scx_ops);
if (online && SCX_HAS_OP(cpu_online))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
@@ -3819,12 +3473,6 @@ static void rq_offline_scx(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_ONLINE;
}
-#else /* CONFIG_SMP */
-
-static bool test_and_clear_cpu_idle(int cpu) { return false; }
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
-static void reset_idle_masks(void) {}
-
#endif /* CONFIG_SMP */
static bool check_rq_for_timeouts(struct rq *rq)
@@ -4749,8 +4397,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj,
}
SCX_ATTR(ops);
+#define scx_attr_event_show(buf, at, events, kind) ({ \
+ sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \
+})
+
+static ssize_t scx_attr_events_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ struct scx_event_stats events;
+ int at = 0;
+
+ scx_bpf_events(&events, sizeof(events));
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
+ return at;
+}
+SCX_ATTR(events);
+
static struct attribute *scx_sched_attrs[] = {
&scx_attr_ops.attr,
+ &scx_attr_events.attr,
NULL,
};
ATTRIBUTE_GROUPS(scx_sched);
@@ -4862,6 +4535,8 @@ static void scx_clear_softlockup(void)
static void scx_ops_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
+ static unsigned long bypass_timestamp;
+
int cpu;
unsigned long flags;
@@ -4871,11 +4546,15 @@ static void scx_ops_bypass(bool bypass)
WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
if (scx_ops_bypass_depth != 1)
goto unlock;
+ bypass_timestamp = ktime_get_ns();
+ scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
} else {
scx_ops_bypass_depth--;
WARN_ON_ONCE(scx_ops_bypass_depth < 0);
if (scx_ops_bypass_depth != 0)
goto unlock;
+ scx_add_event(SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
}
atomic_inc(&scx_ops_breather_depth);
@@ -5095,11 +4774,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
static_branch_disable(&scx_has_op[i]);
+ static_branch_disable(&scx_ops_allow_queued_wakeup);
static_branch_disable(&scx_ops_enq_last);
static_branch_disable(&scx_ops_enq_exiting);
static_branch_disable(&scx_ops_enq_migration_disabled);
static_branch_disable(&scx_ops_cpu_preempt);
- static_branch_disable(&scx_builtin_idle_enabled);
+ scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
@@ -5349,6 +5029,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
.at_jiffies = jiffies,
};
struct seq_buf s;
+ struct scx_event_stats events;
unsigned long flags;
char *buf;
int cpu;
@@ -5457,6 +5138,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
rq_unlock(rq, &rf);
}
+ dump_newline(&s);
+ dump_line(&s, "Event counters");
+ dump_line(&s, "--------------");
+
+ scx_bpf_events(&events, sizeof(events));
+ scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
+
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
memcpy(ei->dump + dump_len - sizeof(trunc_marker),
trunc_marker, sizeof(trunc_marker));
@@ -5546,6 +5242,16 @@ static int validate_ops(const struct sched_ext_ops *ops)
return -EINVAL;
}
+ /*
+ * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
+ * selection policy to be enabled.
+ */
+ if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
+ (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -5564,6 +5270,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
mutex_lock(&scx_ops_enable_mutex);
+ /*
+ * Clear event counters so a new scx scheduler gets
+ * fresh event counter values.
+ */
+ for_each_possible_cpu(cpu) {
+ struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
+ memset(e, 0, sizeof(*e));
+ }
+
if (!scx_ops_helper) {
WRITE_ONCE(scx_ops_helper,
scx_create_rt_helper("sched_ext_ops_helper"));
@@ -5661,9 +5376,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable_cpuslocked(&scx_has_op[i]);
check_hotplug_seq(ops);
-#ifdef CONFIG_SMP
- update_selcpu_topology();
-#endif
+ scx_idle_update_selcpu_topology(ops);
+
cpus_read_unlock();
ret = validate_ops(ops);
@@ -5702,9 +5416,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (((void (**)(void))ops)[i])
static_branch_enable(&scx_has_op[i]);
+ if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ static_branch_enable(&scx_ops_allow_queued_wakeup);
if (ops->flags & SCX_OPS_ENQ_LAST)
static_branch_enable(&scx_ops_enq_last);
-
if (ops->flags & SCX_OPS_ENQ_EXITING)
static_branch_enable(&scx_ops_enq_exiting);
if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
@@ -5712,12 +5427,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable(&scx_ops_cpu_preempt);
- if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
- reset_idle_masks();
- static_branch_enable(&scx_builtin_idle_enabled);
- } else {
- static_branch_disable(&scx_builtin_idle_enabled);
- }
+ scx_idle_enable(ops);
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -6356,10 +6066,8 @@ void __init init_sched_ext_class(void)
SCX_TG_ONLINE);
BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
-#ifdef CONFIG_SMP
- BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
- BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
-#endif
+ scx_idle_init_masks();
+
scx_kick_cpus_pnt_seqs =
__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
__alignof__(scx_kick_cpus_pnt_seqs[0]));
@@ -6367,15 +6075,16 @@ void __init init_sched_ext_class(void)
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
+ int n = cpu_to_node(cpu);
init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
@@ -6392,65 +6101,6 @@ void __init init_sched_ext_class(void)
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-#include <linux/btf_ids.h>
-
-__bpf_kfunc_start_defs();
-
-static bool check_builtin_idle_enabled(void)
-{
- if (static_branch_likely(&scx_builtin_idle_enabled))
- return true;
-
- scx_ops_error("built-in idle tracking is disabled");
- return false;
-}
-
-/**
- * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
- * @p: task_struct to select a CPU for
- * @prev_cpu: CPU @p was on previously
- * @wake_flags: %SCX_WAKE_* flags
- * @is_idle: out parameter indicating whether the returned CPU is idle
- *
- * Can only be called from ops.select_cpu() if the built-in CPU selection is
- * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
- *
- * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
- * currently idle and thus a good candidate for direct dispatching.
- */
-__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
- u64 wake_flags, bool *is_idle)
-{
- if (!ops_cpu_valid(prev_cpu, NULL))
- goto prev_cpu;
-
- if (!check_builtin_idle_enabled())
- goto prev_cpu;
-
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
- goto prev_cpu;
-
-#ifdef CONFIG_SMP
- return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
-#endif
-
-prev_cpu:
- *is_idle = false;
- return prev_cpu;
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
-BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
-
-static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
- .owner = THIS_MODULE,
- .set = &scx_kfunc_ids_select_cpu,
-};
-
static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
{
if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
@@ -6972,8 +6622,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
* CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
* the current local DSQ for running tasks and thus are not
* visible to the BPF scheduler.
+ *
+ * Also skip re-enqueueing tasks that can only run on this
+ * CPU, as they would just be re-added to the same local
+ * DSQ without any benefit.
*/
- if (p->migration_pending)
+ if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
continue;
dispatch_dequeue(rq, p);
@@ -7470,6 +7124,16 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
}
/**
+ * scx_bpf_nr_node_ids - Return the number of possible node IDs
+ *
+ * All valid node IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_node_ids(void)
+{
+ return nr_node_ids;
+}
+
+/**
* scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
*
* All valid CPU IDs in the system are smaller than the returned value.
@@ -7510,142 +7174,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
}
/**
- * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
- * per-CPU cpumask.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
-{
- if (!check_builtin_idle_enabled())
- return cpu_none_mask;
-
-#ifdef CONFIG_SMP
- return idle_masks.cpu;
-#else
- return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
- * per-physical-core cpumask. Can be used to determine if an entire physical
- * core is free.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
-{
- if (!check_builtin_idle_enabled())
- return cpu_none_mask;
-
-#ifdef CONFIG_SMP
- if (sched_smt_active())
- return idle_masks.smt;
- else
- return idle_masks.cpu;
-#else
- return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
- * either the percpu, or SMT idle-tracking cpumask.
- * @idle_mask: &cpumask to use
- */
-__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
-{
- /*
- * Empty function body because we aren't actually acquiring or releasing
- * a reference to a global idle cpumask, which is read-only in the
- * caller and is never released. The acquire / release semantics here
- * are just used to make the cpumask a trusted pointer in the caller.
- */
-}
-
-/**
- * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
- * @cpu: cpu to test and clear idle for
- *
- * Returns %true if @cpu was idle and its idle state was successfully cleared.
- * %false otherwise.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
-{
- if (!check_builtin_idle_enabled())
- return false;
-
- if (ops_cpu_valid(cpu, NULL))
- return test_and_clear_cpu_idle(cpu);
- else
- return false;
-}
-
-/**
- * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
- * number on success. -%EBUSY if no matching cpu was found.
- *
- * Idle CPU tracking may race against CPU scheduling state transitions. For
- * example, this function may return -%EBUSY as CPUs are transitioning into the
- * idle state. If the caller then assumes that there will be dispatch events on
- * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
- * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
- * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
- * event in the near future.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
-{
- if (!check_builtin_idle_enabled())
- return -EBUSY;
-
- return scx_pick_idle_cpu(cpus_allowed, flags);
-}
-
-/**
- * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
- * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
- * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
- * empty.
- *
- * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
- * set, this function can't tell which CPUs are idle and will always pick any
- * CPU.
- */
-__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
-{
- s32 cpu;
-
- if (static_branch_likely(&scx_builtin_idle_enabled)) {
- cpu = scx_pick_idle_cpu(cpus_allowed, flags);
- if (cpu >= 0)
- return cpu;
- }
-
- cpu = cpumask_any_distribute(cpus_allowed);
- if (cpu < nr_cpu_ids)
- return cpu;
- else
- return -EBUSY;
-}
-
-/**
* scx_bpf_task_running - Is task currently running?
* @p: task of interest
*/
@@ -7765,6 +7293,43 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+/**
+ * scx_bpf_events - Copy the system-wide event counters to @events
+ * @events: output buffer from a BPF program
+ * @events__sz: @events len, must end in '__sz' for the verifier
+ */
+__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
+ size_t events__sz)
+{
+ struct scx_event_stats e_sys, *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into the system-wide counters. */
+ memset(&e_sys, 0, sizeof(e_sys));
+ for_each_possible_cpu(cpu) {
+ e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+
+ /*
+ * We cannot entirely trust a BPF-provided size since a BPF program
+	 * might be compiled against a different vmlinux.h, in which
+	 * struct scx_event_stats may be larger (a newer vmlinux.h) or smaller
+	 * (an older vmlinux.h). Hence, use the smaller of the two sizes to
+	 * avoid memory corruption.
+ */
+ events__sz = min(events__sz, sizeof(*events));
+ memcpy(events, &e_sys, events__sz);
+}
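As a rough sketch of the calling convention from the BPF side (the __sz suffix tells the verifier which argument carries the buffer size), assuming the kfunc declaration from the sched_ext BPF headers and the field names implied by the scx_agg_event() calls above; the helper name is made up:

/* Illustrative only: print a couple of aggregated event counters. */
static void dump_scx_events(void)
{
	struct scx_event_stats events;

	/* The kernel copies at most min(sizeof(events), its own struct size). */
	scx_bpf_events(&events, sizeof(events));

	bpf_printk("select_cpu_fallback=%lld dispatch_keep_last=%lld",
		   events.SCX_EV_SELECT_CPU_FALLBACK,
		   events.SCX_EV_DISPATCH_KEEP_LAST);
}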
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7780,6 +7345,7 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
@@ -7797,6 +7363,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
@@ -7820,8 +7387,6 @@ static int __init scx_init(void)
* check using scx_kf_allowed().
*/
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
- &scx_kfunc_set_select_cpu)) ||
- (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_enqueue_dispatch)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_dispatch)) ||
@@ -7841,6 +7406,12 @@ static int __init scx_init(void)
return ret;
}
+ ret = scx_idle_init();
+ if (ret) {
+ pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret);
+ return ret;
+ }
+
ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
if (ret) {
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 1079b56b0f7a..1bda96b19a1b 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,6 +8,8 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
+DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p)
return scx_enabled() && p->sched_class == &ext_sched_class;
}
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+ return !scx_enabled() ||
+ static_branch_likely(&scx_ops_allow_queued_wakeup) ||
+ p->sched_class != &ext_sched_class;
+}
+
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
@@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {}
static inline void scx_rq_deactivate(struct rq *rq) {}
static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
static inline bool task_on_scx(const struct task_struct *p) { return false; }
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; }
static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
new file mode 100644
index 000000000000..52c36a70a3d0
--- /dev/null
+++ b/kernel/sched/ext_idle.c
@@ -0,0 +1,1171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Built-in idle CPU tracking policy.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#include "ext_idle.h"
+
+/* Enable/disable built-in idle CPU selection policy */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
+/* Enable/disable per-node idle cpumasks */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
+
+#ifdef CONFIG_SMP
+/* Enable/disable LLC aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+
+/* Enable/disable NUMA aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+
+/*
+ * cpumasks to track idle CPUs within each NUMA node.
+ *
+ * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask
+ * is used to track all the idle CPUs in the system.
+ */
+struct scx_idle_cpus {
+ cpumask_var_t cpu;
+ cpumask_var_t smt;
+};
+
+/*
+ * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
+ * is not enabled).
+ */
+static struct scx_idle_cpus scx_idle_global_masks;
+
+/*
+ * Per-node idle cpumasks.
+ */
+static struct scx_idle_cpus **scx_idle_node_masks;
+
+/*
+ * Return the idle masks associated with a target @node.
+ *
+ * NUMA_NO_NODE identifies the global idle cpumask.
+ */
+static struct scx_idle_cpus *idle_cpumask(int node)
+{
+ return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node];
+}
+
+/*
+ * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if
+ * per-node idle cpumasks are disabled.
+ */
+static int scx_cpu_node_if_enabled(int cpu)
+{
+ if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
+ return NUMA_NO_NODE;
+
+ return cpu_to_node(cpu);
+}
+
+bool scx_idle_test_and_clear_cpu(int cpu)
+{
+ int node = scx_cpu_node_if_enabled(cpu);
+ struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT
+ * cluster is not wholly idle either way. This also prevents
+ * scx_pick_idle_cpu() from getting caught in an infinite loop.
+ */
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+ /*
+ * If offline, @cpu is not its own sibling and
+ * scx_pick_idle_cpu() can get caught in an infinite loop as
+ * @cpu is never cleared from the idle SMT mask. Ensure that
+ * @cpu is eventually cleared.
+ *
+ * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
+ * reduce memory writes, which may help alleviate cache
+ * coherence pressure.
+ */
+ if (cpumask_intersects(smt, idle_smts))
+ cpumask_andnot(idle_smts, idle_smts, smt);
+ else if (cpumask_test_cpu(cpu, idle_smts))
+ __cpumask_clear_cpu(cpu, idle_smts);
+ }
+#endif
+
+ return cpumask_test_and_clear_cpu(cpu, idle_cpus);
+}
+
+/*
+ * Pick an idle CPU in a specific NUMA node.
+ */
+static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ int cpu;
+
+retry:
+ if (sched_smt_active()) {
+ cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ goto found;
+
+ if (flags & SCX_PICK_IDLE_CORE)
+ return -EBUSY;
+ }
+
+ cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed);
+ if (cpu >= nr_cpu_ids)
+ return -EBUSY;
+
+found:
+ if (scx_idle_test_and_clear_cpu(cpu))
+ return cpu;
+ else
+ goto retry;
+}
+
+/*
+ * Tracks nodes that have not yet been visited when searching for an idle
+ * CPU across all available nodes.
+ */
+static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited);
+
+/*
+ * Search for an idle CPU across all nodes, excluding @node.
+ */
+static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ nodemask_t *unvisited;
+ s32 cpu = -EBUSY;
+
+ preempt_disable();
+ unvisited = this_cpu_ptr(&per_cpu_unvisited);
+
+ /*
+ * Restrict the search to the online nodes (excluding the current
+ * node that has been visited already).
+ */
+ nodes_copy(*unvisited, node_states[N_ONLINE]);
+ node_clear(node, *unvisited);
+
+ /*
+ * Traverse all nodes in order of increasing distance, starting
+ * from @node.
+ *
+	 * This loop is O(N^2), with N being the number of NUMA nodes,
+	 * which might be quite expensive on large NUMA systems. However,
+	 * this complexity comes into play only when a scheduler enables
+	 * SCX_OPS_BUILTIN_IDLE_PER_NODE and requests an idle CPU without
+	 * specifying a target NUMA node, so it shouldn't be a bottleneck
+	 * in most cases.
+ *
+ * As a future optimization we may want to cache the list of nodes
+ * in a per-node array, instead of actually traversing them every
+ * time.
+ */
+ for_each_node_numadist(node, *unvisited) {
+ cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ break;
+ }
+ preempt_enable();
+
+ return cpu;
+}
+
+/*
+ * Find an idle CPU in the system, starting from @node.
+ */
+s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ s32 cpu;
+
+ /*
+ * Always search in the starting node first (this is an
+ * optimization that can save some cycles even when the search is
+ * not limited to a single node).
+ */
+ cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ return cpu;
+
+ /*
+ * Stop the search if we are using only a single global cpumask
+ * (NUMA_NO_NODE) or if the search is restricted to the first node
+ * only.
+ */
+ if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE)
+ return -EBUSY;
+
+ /*
+ * Extend the search to the other online nodes.
+ */
+ return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags);
+}
+
+/*
+ * Return the number of CPUs in the same LLC domain as @cpu (or zero if the LLC
+ * domain is not defined).
+ */
+static unsigned int llc_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return 0;
+
+ return sd->span_weight;
+}
+
+/*
+ * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
+ * domain is not defined).
+ */
+static struct cpumask *llc_span(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+		return NULL;
+
+ return sched_domain_span(sd);
+}
+
+/*
+ * Return the number of CPUs in the same NUMA domain as @cpu (or zero if the
+ * NUMA domain is not defined).
+ */
+static unsigned int numa_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return 0;
+ sg = sd->groups;
+ if (!sg)
+ return 0;
+
+ return sg->group_weight;
+}
+
+/*
+ * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
+ * domain is not defined).
+ */
+static struct cpumask *numa_span(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return NULL;
+ sg = sd->groups;
+ if (!sg)
+ return NULL;
+
+ return sched_group_span(sg);
+}
+
+/*
+ * Return true if the LLC domains do not perfectly overlap with the NUMA
+ * domains, false otherwise.
+ */
+static bool llc_numa_mismatch(void)
+{
+ int cpu;
+
+ /*
+ * We need to scan all online CPUs to verify whether their scheduling
+ * domains overlap.
+ *
+ * While it is rare to encounter architectures with asymmetric NUMA
+ * topologies, CPU hotplugging or virtualized environments can result
+ * in asymmetric configurations.
+ *
+ * For example:
+ *
+ * NUMA 0:
+ * - LLC 0: cpu0..cpu7
+ * - LLC 1: cpu8..cpu15 [offline]
+ *
+ * NUMA 1:
+ * - LLC 0: cpu16..cpu23
+ * - LLC 1: cpu24..cpu31
+ *
+	 * In this case, if we only check the first online CPU (cpu0), we might
+	 * incorrectly assume that the LLC and NUMA domains are fully
+	 * overlapping, while NUMA 1 in fact has two distinct LLC domains.
+ */
+ for_each_online_cpu(cpu)
+ if (llc_weight(cpu) != numa_weight(cpu))
+ return true;
+
+ return false;
+}
+
+/*
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
+ *
+ * Assumption: the kernel's internal topology representation assumes that each
+ * CPU belongs to a single LLC domain, and that each LLC domain is entirely
+ * contained within a single NUMA node.
+ */
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
+{
+ bool enable_llc = false, enable_numa = false;
+ unsigned int nr_cpus;
+ s32 cpu = cpumask_first(cpu_online_mask);
+
+ /*
+ * Enable LLC domain optimization only when there are multiple LLC
+ * domains among the online CPUs. If all online CPUs are part of a
+ * single LLC domain, the idle CPU selection logic can choose any
+ * online CPU without bias.
+ *
+ * Note that it is sufficient to check the LLC domain of the first
+ * online CPU to determine whether a single LLC domain includes all
+ * CPUs.
+ */
+ rcu_read_lock();
+ nr_cpus = llc_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus())
+ enable_llc = true;
+ pr_debug("sched_ext: LLC=%*pb weight=%u\n",
+ cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
+ }
+
+ /*
+ * Enable NUMA optimization only when there are multiple NUMA domains
+	 * among the online CPUs and the NUMA domains don't perfectly overlap
+ * with the LLC domains.
+ *
+ * If all CPUs belong to the same NUMA node and the same LLC domain,
+ * enabling both NUMA and LLC optimizations is unnecessary, as checking
+ * for an idle CPU in the same domain twice is redundant.
+ *
+	 * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, ignore the NUMA
+	 * optimization, as we would naturally select idle CPUs within
+	 * specific NUMA nodes by querying the corresponding per-node cpumask.
+ */
+ if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+ nr_cpus = numa_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
+ enable_numa = true;
+ pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
+ cpumask_pr_args(numa_span(cpu)), nr_cpus);
+ }
+ }
+ rcu_read_unlock();
+
+ pr_debug("sched_ext: LLC idle selection %s\n",
+ str_enabled_disabled(enable_llc));
+ pr_debug("sched_ext: NUMA idle selection %s\n",
+ str_enabled_disabled(enable_numa));
+
+ if (enable_llc)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+ if (enable_numa)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
+}
+
+/*
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ * - always prioritize CPUs from fully idle cores (both logical CPUs are
+ * idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same LLC
+ * to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ * - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * 5. Pick any idle CPU usable by the task.
+ *
+ * Steps 3 and 4 are performed only if the system has, respectively,
+ * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs.
+ *
+ * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always
+ * begin in @prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ *
+ * Return the picked CPU if idle, or a negative value otherwise.
+ *
+ * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
+ * we never call ops.select_cpu() for them, see select_task_rq().
+ */
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
+{
+ const struct cpumask *llc_cpus = NULL;
+ const struct cpumask *numa_cpus = NULL;
+ int node = scx_cpu_node_if_enabled(prev_cpu);
+ s32 cpu;
+
+	/*
+	 * The RCU read lock protects the sched domains dereferenced by
+	 * llc_span() and numa_span() when populating llc_cpus and numa_cpus.
+	 */
+ rcu_read_lock();
+
+ /*
+ * Determine the scheduling domain only if the task is allowed to run
+ * on all CPUs.
+ *
+ * This is done primarily for efficiency, as it avoids the overhead of
+ * updating a cpumask every time we need to select an idle CPU (which
+ * can be costly in large SMP systems), but it also aligns logically:
+ * if a task's scheduling domain is restricted by user-space (through
+ * CPU affinity), the task will simply use the flat scheduling domain
+ * defined by user-space.
+ */
+ if (p->nr_cpus_allowed >= num_possible_cpus()) {
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+ numa_cpus = numa_span(prev_cpu);
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
+ llc_cpus = llc_span(prev_cpu);
+ }
+
+ /*
+ * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
+ */
+ if (wake_flags & SCX_WAKE_SYNC) {
+ int waker_node;
+
+ /*
+ * If the waker's CPU is cache affine and prev_cpu is idle,
+ * then avoid a migration.
+ */
+ cpu = smp_processor_id();
+ if (cpus_share_cache(cpu, prev_cpu) &&
+ scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * If the waker's local DSQ is empty, and the system is under
+ * utilized, try to wake up @p to the local DSQ of the waker.
+ *
+ * Checking only for an empty local DSQ is insufficient as it
+ * could give the wakee an unfair advantage when the system is
+ * oversaturated.
+ *
+ * Checking only for the presence of idle CPUs is also
+ * insufficient as the local DSQ of the waker could have tasks
+ * piled up on it even if there is an idle core elsewhere on
+ * the system.
+ */
+ waker_node = cpu_to_node(cpu);
+ if (!(current->flags & PF_EXITING) &&
+ cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
+ (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
+ !cpumask_empty(idle_cpumask(waker_node)->cpu)) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * If CPU has SMT, any wholly idle CPU is likely a better pick than
+ * partially idle @prev_cpu.
+ */
+ if (sched_smt_active()) {
+ /*
+ * Keep using @prev_cpu if it's part of a fully idle core.
+ */
+ if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
+ scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any fully idle core in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any full-idle core usable by the task.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always
+ * begin in prev_cpu's node and proceed to other nodes in
+ * order of increasing distance.
+ */
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+
+ /*
+ * Give up if we're strictly looking for a full-idle SMT
+ * core.
+ */
+ if (flags & SCX_PICK_IDLE_CORE) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Use @prev_cpu if it's idle.
+ */
+ if (scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = pick_idle_cpu_in_node(llc_cpus, node, 0);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = pick_idle_cpu_in_node(numa_cpus, node, 0);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU usable by the task.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin
+ * in prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ */
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
+ if (cpu >= 0)
+ goto out_unlock;
+
+out_unlock:
+ rcu_read_unlock();
+
+ return cpu;
+}
+
+/*
+ * Initialize global and per-node idle cpumasks.
+ */
+void scx_idle_init_masks(void)
+{
+ int node;
+
+ /* Allocate global idle cpumasks */
+ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
+
+ /* Allocate per-node idle cpumasks */
+ scx_idle_node_masks = kcalloc(num_possible_nodes(),
+ sizeof(*scx_idle_node_masks), GFP_KERNEL);
+ BUG_ON(!scx_idle_node_masks);
+
+ for_each_node(node) {
+ scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, node);
+ BUG_ON(!scx_idle_node_masks[node]);
+
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+ }
+}
+
+static void update_builtin_idle(int cpu, bool idle)
+{
+ int node = scx_cpu_node_if_enabled(cpu);
+ struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+ assign_cpu(cpu, idle_cpus, idle);
+
+#ifdef CONFIG_SCHED_SMT
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+ if (idle) {
+ /*
+ * idle_smt handling is racy but that's fine as it's
+ * only for optimization and self-correcting.
+ */
+ if (!cpumask_subset(smt, idle_cpus))
+ return;
+ cpumask_or(idle_smts, idle_smts, smt);
+ } else {
+ cpumask_andnot(idle_smts, idle_smts, smt);
+ }
+ }
+#endif
+}
+
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ */
+ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+ /*
+ * Update the idle masks:
+ * - for real idle transitions (do_notify == true)
+ * - for idle-to-idle transitions (indicated by the previous task
+ * being the idle thread, managed by pick_task_idle())
+ *
+ * Skip updating idle masks if the previous task is not the idle
+ * thread, since set_next_task_idle() has already handled it when
+ * transitioning from a task to the idle thread (calling this
+ * function with do_notify == true).
+ *
+ * In this way we can avoid updating the idle masks twice,
+ * unnecessarily.
+ */
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ if (do_notify || is_idle_task(rq->curr))
+ update_builtin_idle(cpu, idle);
+}
+
+static void reset_idle_masks(struct sched_ext_ops *ops)
+{
+ int node;
+
+ /*
+ * Consider all online cpus idle. Should converge to the actual state
+ * quickly.
+ */
+ if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+ cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask);
+ cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask);
+ return;
+ }
+
+ for_each_node(node) {
+ const struct cpumask *node_mask = cpumask_of_node(node);
+
+ cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask);
+ cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
+ }
+}
+#endif /* CONFIG_SMP */
+
+void scx_idle_enable(struct sched_ext_ops *ops)
+{
+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
+ static_branch_enable(&scx_builtin_idle_enabled);
+ else
+ static_branch_disable(&scx_builtin_idle_enabled);
+
+ if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
+ static_branch_enable(&scx_builtin_idle_per_node);
+ else
+ static_branch_disable(&scx_builtin_idle_per_node);
+
+#ifdef CONFIG_SMP
+ reset_idle_masks(ops);
+#endif
+}
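To make the enable conditions above concrete, here is a hedged BPF-side sketch (names and structure are illustrative, assuming the usual scx struct_ops boilerplate): a scheduler that implements ops.update_idle() but still wants built-in tracking sets %SCX_OPS_KEEP_BUILTIN_IDLE, and opts into per-node idle cpumasks with %SCX_OPS_BUILTIN_IDLE_PER_NODE.

void BPF_STRUCT_OPS(sketch_update_idle, s32 cpu, bool idle)
{
	/* e.g. mirror idle transitions into a map consumed by userspace */
}

SEC(".struct_ops.link")
struct sched_ext_ops sketch_ops = {
	.update_idle	= (void *)sketch_update_idle,
	/* Keep built-in idle tracking and use per-node idle cpumasks. */
	.flags		= SCX_OPS_KEEP_BUILTIN_IDLE |
			  SCX_OPS_BUILTIN_IDLE_PER_NODE,
	.name		= "sketch",
};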
+
+void scx_idle_disable(void)
+{
+ static_branch_disable(&scx_builtin_idle_enabled);
+ static_branch_disable(&scx_builtin_idle_per_node);
+}
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+
+static int validate_node(int node)
+{
+ if (!static_branch_likely(&scx_builtin_idle_per_node)) {
+ scx_ops_error("per-node idle tracking is disabled");
+ return -EOPNOTSUPP;
+ }
+
+ /* Return no entry for NUMA_NO_NODE (not a critical scx error) */
+ if (node == NUMA_NO_NODE)
+ return -ENOENT;
+
+ /* Make sure node is in a valid range */
+ if (node < 0 || node >= nr_node_ids) {
+ scx_ops_error("invalid node %d", node);
+ return -EINVAL;
+ }
+
+ /* Make sure the node is part of the set of possible nodes */
+ if (!node_possible(node)) {
+ scx_ops_error("unavailable node %d", node);
+ return -EINVAL;
+ }
+
+ return node;
+}
+
+__bpf_kfunc_start_defs();
+
+static bool check_builtin_idle_enabled(void)
+{
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ return true;
+
+ scx_ops_error("built-in idle tracking is disabled");
+ return false;
+}
+
+/**
+ * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
+ * trigger an error if @cpu is invalid
+ * @cpu: target CPU
+ */
+__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
+{
+#ifdef CONFIG_NUMA
+ if (!ops_cpu_valid(cpu, NULL))
+ return NUMA_NO_NODE;
+
+ return cpu_to_node(cpu);
+#else
+ return 0;
+#endif
+}
+
+/**
+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @is_idle: out parameter indicating whether the returned CPU is idle
+ *
+ * Can only be called from ops.select_cpu() if the built-in CPU selection is
+ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
+ * currently idle and thus a good candidate for direct dispatching.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+ u64 wake_flags, bool *is_idle)
+{
+#ifdef CONFIG_SMP
+ s32 cpu;
+#endif
+ if (!ops_cpu_valid(prev_cpu, NULL))
+ goto prev_cpu;
+
+ if (!check_builtin_idle_enabled())
+ goto prev_cpu;
+
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
+ goto prev_cpu;
+
+#ifdef CONFIG_SMP
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ if (cpu >= 0) {
+ *is_idle = true;
+ return cpu;
+ }
+#endif
+
+prev_cpu:
+ *is_idle = false;
+ return prev_cpu;
+}
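A minimal sketch of the intended usage from ops.select_cpu(), in the spirit of the example schedulers (the op name is hypothetical; scx_bpf_dsq_insert(), SCX_DSQ_LOCAL and SCX_SLICE_DFL are assumed to be available to the BPF program):

s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
		/* The picked CPU was claimed idle: dispatch directly to it. */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	}

	return cpu;
}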
+
+/**
+ * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
+ * idle-tracking per-CPU cpumask of a target NUMA node.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or if running on a UP kernel. In these cases the actual error
+ * will be reported to the BPF scheduler via scx_ops_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
+{
+ node = validate_node(node);
+ if (node < 0)
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ return idle_cpumask(node)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
+ * per-CPU cpumask.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or if running on
+ * a UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+{
+ if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ return cpu_none_mask;
+ }
+
+ if (!check_builtin_idle_enabled())
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ return idle_cpumask(NUMA_NO_NODE)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the
+ * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
+ * used to determine if an entire physical core is free.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or if running on a UP kernel. In these cases the actual error
+ * will be reported to the BPF scheduler via scx_ops_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
+{
+ node = validate_node(node);
+ if (node < 0)
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ if (sched_smt_active())
+ return idle_cpumask(node)->smt;
+ else
+ return idle_cpumask(node)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
+ * per-physical-core cpumask. Can be used to determine if an entire physical
+ * core is free.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or if running on
+ * a UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+{
+ if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ return cpu_none_mask;
+ }
+
+ if (!check_builtin_idle_enabled())
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ if (sched_smt_active())
+ return idle_cpumask(NUMA_NO_NODE)->smt;
+ else
+ return idle_cpumask(NUMA_NO_NODE)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
+ * either the percpu, or SMT idle-tracking cpumask.
+ * @idle_mask: &cpumask to use
+ */
+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
+{
+ /*
+ * Empty function body because we aren't actually acquiring or releasing
+ * a reference to a global idle cpumask, which is read-only in the
+ * caller and is never released. The acquire / release semantics here
+ * are just used to make the cpumask a trusted pointer in the caller.
+ */
+}
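The acquire/release pairing typically looks like the sketch below; the helper name is hypothetical and bpf_cpumask_test_cpu() is assumed from the cpumask kfuncs:

/* Check whether @cpu currently sits on a fully idle SMT core. */
static bool cpu_on_fully_idle_core(s32 cpu)
{
	const struct cpumask *smtmask = scx_bpf_get_idle_smtmask();
	bool idle_core = bpf_cpumask_test_cpu(cpu, smtmask);

	scx_bpf_put_idle_cpumask(smtmask);

	return idle_core;
}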
+
+/**
+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
+ * @cpu: cpu to test and clear idle for
+ *
+ * Returns %true if @cpu was idle and its idle state was successfully cleared.
+ * %false otherwise.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ */
+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
+{
+ if (!check_builtin_idle_enabled())
+ return false;
+
+ if (ops_cpu_valid(cpu, NULL))
+ return scx_idle_test_and_clear_cpu(cpu);
+ else
+ return false;
+}
+
+/**
+ * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
+ *
+ * Returns the picked idle cpu number on success, or -%EBUSY if no matching
+ * cpu was found.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node).
+ *
+ * Always returns an error if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if
+ * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
+ int node, u64 flags)
+{
+ node = validate_node(node);
+ if (node < 0)
+ return node;
+
+ return scx_pick_idle_cpu(cpus_allowed, node, flags);
+}
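For illustration, a hedged sketch of a node-first pick starting from the wakee's previous node; the helper is hypothetical, requires %SCX_OPS_BUILTIN_IDLE_PER_NODE, and assumes p->cpus_ptr is accessible from the BPF program:

static s32 pick_cpu_node_first(struct task_struct *p, s32 prev_cpu)
{
	int node = scx_bpf_cpu_node(prev_cpu);
	s32 cpu;

	/* Try strictly within @node first, then allow spilling to other nodes. */
	cpu = scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, SCX_PICK_IDLE_IN_NODE);
	if (cpu < 0)
		cpu = scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, 0);

	return cpu;	/* may still be -EBUSY if everything is busy */
}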
+
+/**
+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
+ * number on success. -%EBUSY if no matching cpu was found.
+ *
+ * Idle CPU tracking may race against CPU scheduling state transitions. For
+ * example, this function may return -%EBUSY as CPUs are transitioning into the
+ * idle state. If the caller then assumes that there will be dispatch events on
+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
+ * event in the near future.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_idle_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+ scx_ops_error("per-node idle tracking is enabled");
+ return -EBUSY;
+ }
+
+ if (!check_builtin_idle_enabled())
+ return -EBUSY;
+
+ return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+}
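The fallback pattern suggested above (pair scx_bpf_pick_any_cpu() with scx_bpf_kick_cpu() so that a dispatch event is guaranteed) might look like this from ops.enqueue(); the op name and SHARED_DSQ, a user-created DSQ, are hypothetical:

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	s32 cpu;

	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

	/* Make sure at least one usable CPU will run and consume the DSQ. */
	cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		scx_bpf_kick_cpu(cpu, 0);
}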
+
+/**
+ * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available
+ * or pick any CPU from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node, regardless of
+ * the CPU idle state).
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
+ int node, u64 flags)
+{
+ s32 cpu;
+
+ node = validate_node(node);
+ if (node < 0)
+ return node;
+
+ cpu = scx_pick_idle_cpu(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ return cpu;
+
+ if (flags & SCX_PICK_IDLE_IN_NODE)
+ cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed);
+ else
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+/**
+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_any_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ s32 cpu;
+
+ if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+ scx_ops_error("per-node idle tracking is enabled");
+ return -EBUSY;
+ }
+
+ if (static_branch_likely(&scx_builtin_idle_enabled)) {
+ cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+ if (cpu >= 0)
+ return cpu;
+ }
+
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_idle)
+BTF_ID_FLAGS(func, scx_bpf_cpu_node)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_idle)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_idle,
+};
+
+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_select_cpu,
+};
+
+int scx_idle_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
+
+ return ret;
+}
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
new file mode 100644
index 000000000000..511cc2221f7a
--- /dev/null
+++ b/kernel/sched/ext_idle.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#ifndef _KERNEL_SCHED_EXT_IDLE_H
+#define _KERNEL_SCHED_EXT_IDLE_H
+
+struct sched_ext_ops;
+
+#ifdef CONFIG_SMP
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
+void scx_idle_init_masks(void);
+bool scx_idle_test_and_clear_cpu(int cpu);
+s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags);
+#else /* !CONFIG_SMP */
+static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
+static inline void scx_idle_init_masks(void) {}
+static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
+static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ return -EBUSY;
+}
+#endif /* CONFIG_SMP */
+
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags);
+void scx_idle_enable(struct sched_ext_ops *ops);
+void scx_idle_disable(void);
+int scx_idle_init(void);
+
+#endif /* _KERNEL_SCHED_EXT_IDLE_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c798d2795243..e43993a4e580 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -74,12 +74,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
*
- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_base_slice = 750000ULL;
-static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
+unsigned int sysctl_sched_base_slice = 700000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL;
static int __init setup_sched_thermal_decay_shift(char *str)
{
@@ -399,7 +399,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
- SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+ WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
/* Iterate through all leaf cfs_rq's on a runqueue */
@@ -696,7 +696,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
- SCHED_WARN_ON(!se->on_rq);
+ WARN_ON_ONCE(!se->on_rq);
vlag = avg_vruntime(cfs_rq) - se->vruntime;
limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
@@ -884,6 +884,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
}
/*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+static inline void set_protect_slice(struct sched_entity *se)
+{
+ se->vlag = se->deadline;
+}
+
+static inline bool protect_slice(struct sched_entity *se)
+{
+ return se->vlag == se->deadline;
+}
+
+static inline void cancel_protect_slice(struct sched_entity *se)
+{
+ if (protect_slice(se))
+ se->vlag = se->deadline + 1;
+}
+
+/*
* Earliest Eligible Virtual Deadline First
*
* In order to provide latency guarantees for different request sizes
@@ -919,11 +939,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
- /*
- * Once selected, run a task until it either becomes non-eligible or
- * until it gets a new slice. See the HACK in set_next_entity().
- */
- if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr))
return curr;
/* Pick the leftmost entity if it's eligible */
@@ -967,7 +983,6 @@ found:
return best;
}
-#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -994,7 +1009,6 @@ int sched_update_scaling(void)
return 0;
}
#endif
-#endif
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
@@ -3301,7 +3315,7 @@ static void task_numa_work(struct callback_head *work)
bool vma_pids_skipped;
bool vma_pids_forced = false;
- SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
work->next = work;
/*
@@ -4020,7 +4034,7 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa)
* Make sure that rounding and/or propagation of PELT values never
* break this.
*/
- SCHED_WARN_ON(sa->load_avg ||
+ WARN_ON_ONCE(sa->load_avg ||
sa->util_avg ||
sa->runnable_avg);
@@ -5444,7 +5458,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
if (flags & DEQUEUE_DELAYED) {
- SCHED_WARN_ON(!se->sched_delayed);
+ WARN_ON_ONCE(!se->sched_delayed);
} else {
bool delay = sleep;
/*
@@ -5454,7 +5468,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_SPECIAL)
delay = false;
- SCHED_WARN_ON(delay && se->sched_delayed);
+ WARN_ON_ONCE(delay && se->sched_delayed);
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
@@ -5530,15 +5544,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- /*
- * HACK, stash a copy of deadline at the point of pick in vlag,
- * which isn't used until dequeue.
- */
- se->vlag = se->deadline;
+
+ set_protect_slice(se);
}
update_stats_curr_start(cfs_rq, se);
- SCHED_WARN_ON(cfs_rq->curr);
+ WARN_ON_ONCE(cfs_rq->curr);
cfs_rq->curr = se;
/*
@@ -5579,7 +5590,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
if (sched_feat(PICK_BUDDY) &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
/* ->next will never be delayed */
- SCHED_WARN_ON(cfs_rq->next->sched_delayed);
+ WARN_ON_ONCE(cfs_rq->next->sched_delayed);
return cfs_rq->next;
}
@@ -5615,7 +5626,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
- SCHED_WARN_ON(cfs_rq->curr != prev);
+ WARN_ON_ONCE(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
@@ -5838,7 +5849,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttled_clock_self = 0;
- if (SCHED_WARN_ON((s64)delta < 0))
+ if (WARN_ON_ONCE((s64)delta < 0))
delta = 0;
cfs_rq->throttled_clock_self_time += delta;
@@ -5858,7 +5869,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
- SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);
}
@@ -5967,7 +5978,7 @@ done:
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- SCHED_WARN_ON(cfs_rq->throttled_clock);
+ WARN_ON_ONCE(cfs_rq->throttled_clock);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
return true;
@@ -6123,7 +6134,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
}
/* Already enqueued */
- if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list)))
return;
first = list_empty(&rq->cfsb_csd_list);
@@ -6142,7 +6153,7 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
lockdep_assert_rq_held(rq_of(cfs_rq));
- if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) ||
cfs_rq->runtime_remaining <= 0))
return;
@@ -6178,7 +6189,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
goto next;
/* By the above checks, this should never be true */
- SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+ WARN_ON_ONCE(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
runtime = -cfs_rq->runtime_remaining + 1;
@@ -6199,7 +6210,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
* We currently only expect to be unthrottling
* a single cfs_rq locally.
*/
- SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
list_add_tail(&cfs_rq->throttled_csd_list,
&local_unthrottle);
}
@@ -6224,7 +6235,7 @@ next:
rq_unlock_irqrestore(rq, &rf);
}
- SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
rcu_read_unlock();
@@ -6541,14 +6552,14 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren
cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
- cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
/* Add a random offset so that timers interleave */
hrtimer_set_expires(&cfs_b->period_timer,
get_random_u32_below(cfs_b->period));
- hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
cfs_b->slack_started = false;
}
@@ -6776,7 +6787,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- SCHED_WARN_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
@@ -6887,8 +6898,8 @@ requeue_delayed_entity(struct sched_entity *se)
* Because a delayed entity is one that is still on
* the runqueue competing until elegibility.
*/
- SCHED_WARN_ON(!se->sched_delayed);
- SCHED_WARN_ON(!se->on_rq);
+ WARN_ON_ONCE(!se->sched_delayed);
+ WARN_ON_ONCE(!se->on_rq);
if (sched_feat(DELAY_ZERO)) {
update_entity_lag(cfs_rq, se);
@@ -6991,6 +7002,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_runnable += h_nr_runnable;
@@ -7120,6 +7133,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
update_cfs_group(se);
se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_runnable -= h_nr_runnable;
@@ -7144,8 +7159,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
rq->next_balance = jiffies;
if (p && task_delayed) {
- SCHED_WARN_ON(!task_sleep);
- SCHED_WARN_ON(p->on_rq != 1);
+ WARN_ON_ONCE(!task_sleep);
+ WARN_ON_ONCE(p->on_rq != 1);
/* Fix-up what dequeue_task_fair() skipped */
hrtick_update(rq);
@@ -8723,7 +8738,7 @@ static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
+ if (WARN_ON_ONCE(!se->on_rq))
return;
if (se_is_idle(se))
return;
@@ -8783,8 +8798,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* Preempt an idle entity in favor of a non-idle entity (and don't preempt
* in the inverse case).
*/
- if (cse_is_idle && !pse_is_idle)
+ if (cse_is_idle && !pse_is_idle) {
+ /*
+			 * When a non-idle entity preempts an idle entity,
+			 * don't give the idle entity slice protection.
+ */
+ cancel_protect_slice(se);
goto preempt;
+ }
+
if (cse_is_idle != pse_is_idle)
return;
@@ -8803,8 +8825,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* Note that even if @p does not turn out to be the most eligible
* task at this moment, current's slice protection will be lost.
*/
- if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
- se->vlag = se->deadline + 1;
+ if (do_preempt_short(cfs_rq, pse, se))
+ cancel_protect_slice(se);
/*
* If @p has become the most eligible task, force preemption.
@@ -9417,12 +9439,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
- for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
- env->flags |= LBF_DST_PINNED;
- env->new_dst_cpu = cpu;
- break;
- }
+ cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);
+
+ if (cpu < nr_cpu_ids) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
}
return 0;
@@ -12461,7 +12482,7 @@ unlock:
void nohz_balance_exit_idle(struct rq *rq)
{
- SCHED_WARN_ON(rq != this_rq());
+ WARN_ON_ONCE(rq != this_rq());
if (likely(!rq->nohz_tick_stopped))
return;
@@ -12497,7 +12518,7 @@ void nohz_balance_enter_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- SCHED_WARN_ON(cpu != smp_processor_id());
+ WARN_ON_ONCE(cpu != smp_processor_id());
/* If this CPU is going down, then nothing needs to be done: */
if (!cpu_active(cpu))
@@ -12580,7 +12601,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
int balance_cpu;
struct rq *rq;
- SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
+ WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
/*
* We assume there will be no idle load after this update and clear
@@ -13020,7 +13041,7 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
struct cfs_rq *cfs_rqb;
s64 delta;
- SCHED_WARN_ON(task_rq(b)->core != rq->core);
+ WARN_ON_ONCE(task_rq(b)->core != rq->core);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -13223,7 +13244,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
attach_task_cfs_rq(p);
@@ -13258,7 +13279,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
if (!first)
return;
- SCHED_WARN_ON(se->sched_delayed);
+ WARN_ON_ONCE(se->sched_delayed);
if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);
@@ -13645,7 +13666,6 @@ DEFINE_SCHED_CLASS(fair) = {
#endif
};
-#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
struct cfs_rq *cfs_rq, *pos;
@@ -13679,7 +13699,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4b8e33c615b1..fa03ec3ed56a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -127,9 +127,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
raw_spin_lock_init(&rt_b->rt_runtime_lock);
- hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL_HARD);
- rt_b->rt_period_timer.function = sched_rt_period_timer;
+ hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
}
static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
@@ -169,9 +168,8 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_SCHED_DEBUG
WARN_ON_ONCE(!rt_entity_is_task(rt_se));
-#endif
+
return container_of(rt_se, struct task_struct, rt);
}
@@ -1713,7 +1711,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
- if (SCHED_WARN_ON(list_empty(queue)))
+ if (WARN_ON_ONCE(list_empty(queue)))
return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
@@ -2910,6 +2908,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
int ret;
mutex_lock(&mutex);
+ sched_domains_mutex_lock();
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
@@ -2936,6 +2935,7 @@ undo:
sysctl_sched_rt_period = old_period;
sysctl_sched_rt_runtime = old_runtime;
}
+ sched_domains_mutex_unlock();
mutex_unlock(&mutex);
return ret;
@@ -2967,7 +2967,6 @@ static int sched_rr_handler(const struct ctl_table *table, int write, void *buff
}
#endif /* CONFIG_SYSCTL */
-#ifdef CONFIG_SCHED_DEBUG
void print_rt_stats(struct seq_file *m, int cpu)
{
rt_rq_iter_t iter;
@@ -2978,4 +2977,3 @@ void print_rt_stats(struct seq_file *m, int cpu)
print_rt_rq(m, cpu, rt_rq);
rcu_read_unlock();
}
-#endif /* CONFIG_SCHED_DEBUG */
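The init_rt_bandwidth() hunk above is one instance of the hrtimer_setup() conversion applied throughout this series (ntp.c and posix-timers.c below get the same treatment): the callback is passed to the setup helper instead of being assigned to .function afterwards. A minimal sketch of the before/after shape, with a hypothetical timer owner t and callback name:

static enum hrtimer_restart my_timer_fn(struct hrtimer *h)
{
	return HRTIMER_NORESTART;
}

/* Old style: initialize, then wire up the callback by hand. */
hrtimer_init(&t->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
t->timer.function = my_timer_fn;

/* New style: clock, mode and callback are set up in one call. */
hrtimer_setup(&t->timer, my_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);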
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 023b844159c9..47972f34ea70 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -91,12 +91,6 @@ struct cpuidle_state;
#include "cpupri.h"
#include "cpudeadline.h"
-#ifdef CONFIG_SCHED_DEBUG
-# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
-#else
-# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
-#endif
-
/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED 1
#define TASK_ON_RQ_MIGRATING 2
@@ -998,7 +992,7 @@ struct root_domain {
* Also, corner cases like 'wrap around' could be dangerous, but given
* that u64 is 'big enough', that shouldn't be a concern.
*/
- u64 visit_gen;
+ u64 visit_cookie;
#ifdef HAVE_RT_PUSH_IPI
/*
@@ -1180,10 +1174,8 @@ struct rq {
atomic_t nr_iowait;
-#ifdef CONFIG_SCHED_DEBUG
u64 last_seen_need_resched_ns;
int ticks_without_resched;
-#endif
#ifdef CONFIG_MEMBARRIER
int membarrier_state;
@@ -1571,7 +1563,7 @@ static inline void update_idle_core(struct rq *rq) { }
static inline struct task_struct *task_of(struct sched_entity *se)
{
- SCHED_WARN_ON(!entity_is_task(se));
+ WARN_ON_ONCE(!entity_is_task(se));
return container_of(se, struct task_struct, se);
}
@@ -1652,7 +1644,7 @@ static inline void assert_clock_updated(struct rq *rq)
* The only reason for not seeing a clock update since the
* last rq_pin_lock() is if we're currently skipping updates.
*/
- SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
+ WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP);
}
static inline u64 rq_clock(struct rq *rq)
@@ -1699,7 +1691,7 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
static inline void rq_clock_start_loop_update(struct rq *rq)
{
lockdep_assert_rq_held(rq);
- SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+ WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP);
rq->clock_update_flags |= RQCF_ACT_SKIP;
}
@@ -1712,14 +1704,12 @@ static inline void rq_clock_stop_loop_update(struct rq *rq)
struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
-#ifdef CONFIG_SCHED_DEBUG
/*
* A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
* current pin context is stashed here in case it needs to be
* restored in rq_repin_lock().
*/
unsigned int clock_update_flags;
-#endif
};
extern struct balance_callback balance_push_callback;
@@ -1770,21 +1760,18 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
-#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-# ifdef CONFIG_SMP
- SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
-# endif
+#ifdef CONFIG_SMP
+ WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
#endif
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
{
-#ifdef CONFIG_SCHED_DEBUG
if (rq->clock_update_flags > RQCF_ACT_SKIP)
rf->clock_update_flags = RQCF_UPDATED;
-#endif
+
scx_rq_clock_invalidate(rq);
lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
@@ -1793,12 +1780,10 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
lockdep_repin_lock(__rq_lockp(rq), rf->cookie);
-#ifdef CONFIG_SCHED_DEBUG
/*
* Restore the value we stashed in @rf for this pin context.
*/
rq->clock_update_flags |= rf->clock_update_flags;
-#endif
}
extern
@@ -2072,9 +2057,7 @@ struct sched_group_capacity {
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
-#ifdef CONFIG_SCHED_DEBUG
int id;
-#endif
unsigned long cpumask[]; /* Balance mask */
};
@@ -2114,13 +2097,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
extern int group_balance_cpu(struct sched_group *sg);
-#ifdef CONFIG_SCHED_DEBUG
extern void update_sched_domain_debugfs(void);
extern void dirty_sched_domain_sysctl(int cpu);
-#else
-static inline void update_sched_domain_debugfs(void) { }
-static inline void dirty_sched_domain_sysctl(int cpu) { }
-#endif
extern int sched_update_scaling(void);
@@ -2200,13 +2178,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
}
/*
- * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ * Tunables:
*/
-#ifdef CONFIG_SCHED_DEBUG
-# define const_debug __read_mostly
-#else
-# define const_debug const
-#endif
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
@@ -2218,13 +2191,11 @@ enum {
#undef SCHED_FEAT
-#ifdef CONFIG_SCHED_DEBUG
-
/*
* To support run-time toggling of sched features, all translation units
* (except core.c) reference the sysctl_sched_features defined in core.c.
*/
-extern const_debug unsigned int sysctl_sched_features;
+extern __read_mostly unsigned int sysctl_sched_features;
#ifdef CONFIG_JUMP_LABEL
@@ -2246,24 +2217,6 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#endif /* !CONFIG_JUMP_LABEL */
-#else /* !SCHED_DEBUG: */
-
-/*
- * Each translation unit has its own copy of sysctl_sched_features to allow
- * constants propagation at compile time and compiler optimization based on
- * features default.
- */
-#define SCHED_FEAT(name, enabled) \
- (1UL << __SCHED_FEAT_##name) * enabled |
-static const_debug __maybe_unused unsigned int sysctl_sched_features =
-#include "features.h"
- 0;
-#undef SCHED_FEAT
-
-#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-
-#endif /* !SCHED_DEBUG */
-
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
@@ -2685,7 +2638,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
- SCHED_WARN_ON(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held());
return rq->idle_state;
}
@@ -2843,12 +2796,11 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
# define SCHED_NR_MIGRATE_BREAK 32
#endif
-extern const_debug unsigned int sysctl_sched_nr_migrate;
-extern const_debug unsigned int sysctl_sched_migration_cost;
+extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_base_slice;
-#ifdef CONFIG_SCHED_DEBUG
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
@@ -2859,7 +2811,6 @@ extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
-#endif
#ifdef CONFIG_SCHED_HRTICK
@@ -2932,7 +2883,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
-#ifdef CONFIG_SCHED_DEBUG
/*
* In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
* acquire rq lock instead of rq_lock(). So at the end of these two functions
@@ -2947,9 +2897,6 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
#endif
}
-#else
-static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { }
-#endif
#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
@@ -3162,7 +3109,6 @@ extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
-#ifdef CONFIG_SCHED_DEBUG
extern bool sched_debug_verbose;
extern void print_cfs_stats(struct seq_file *m, int cpu);
@@ -3173,15 +3119,12 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
-# ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_NUMA_BALANCING
extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
unsigned long tpf, unsigned long gsf, unsigned long gpf);
-# endif /* CONFIG_NUMA_BALANCING */
-#else /* !CONFIG_SCHED_DEBUG: */
-static inline void resched_latency_warn(int cpu, u64 latency) { }
-#endif /* !CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_NUMA_BALANCING */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
@@ -3394,6 +3337,31 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; }
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace has opted in to using uclamp, in which case
+ * aggregation at rq level is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
+
+/*
+ * Enabling the static branch takes cpus_read_lock(). Check
+ * uclamp_is_used() before enabling it so that cpus_read_lock() is not
+ * taken on every call, since this static key is never disabled once it
+ * has been enabled.
+ */
+static inline void sched_uclamp_enable(void)
+{
+ if (!uclamp_is_used())
+ static_branch_enable(&sched_uclamp_used);
+}
+
static inline unsigned long uclamp_rq_get(struct rq *rq,
enum uclamp_id clamp_id)
{
@@ -3417,7 +3385,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq)
unsigned long rq_util;
unsigned long max_util;
- if (!static_branch_likely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return false;
rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
@@ -3426,19 +3394,6 @@ static inline bool uclamp_rq_is_capped(struct rq *rq)
return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
}
-/*
- * When uclamp is compiled in, the aggregation at rq level is 'turned off'
- * by default in the fast path and only gets turned on once userspace performs
- * an operation that requires it.
- *
- * Returns true if userspace opted-in to use uclamp and aggregation at rq level
- * hence is active.
- */
-static inline bool uclamp_is_used(void)
-{
- return static_branch_likely(&sched_uclamp_used);
-}
-
#define for_each_clamp_id(clamp_id) \
for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
@@ -3486,6 +3441,8 @@ static inline bool uclamp_is_used(void)
return false;
}
+static inline void sched_uclamp_enable(void) {}
+
static inline unsigned long
uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id)
{
@@ -3619,6 +3576,7 @@ extern int preempt_dynamic_mode;
extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
+extern const char *preempt_modes[];
#ifdef CONFIG_SCHED_MM_CID
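sched_uclamp_enable() above follows a common "enable once, never disable" static-key pattern: the key is tested first so that the cpus_read_lock() taken inside static_branch_enable() is only paid on the first transition. A generic sketch of that pattern, with a hypothetical key name:

static DEFINE_STATIC_KEY_FALSE(my_feature_used);

static inline bool my_feature_is_used(void)
{
	return static_branch_likely(&my_feature_used);
}

static inline void my_feature_enable(void)
{
	/* Only the first caller enables the key (and takes cpus_read_lock()
	 * inside static_branch_enable()); it is never disabled again. */
	if (!my_feature_is_used())
		static_branch_enable(&my_feature_used);
}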
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 19cdbe96f93d..452826df6ae1 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -144,7 +144,7 @@ static inline void psi_enqueue(struct task_struct *p, int flags)
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
- SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
+ WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED));
if (p->in_memstall)
set |= TSK_MEMSTALL;
if (p->in_iowait)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 456d339be98f..c326de1344fb 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -368,7 +368,7 @@ static int uclamp_validate(struct task_struct *p,
* blocking operation which obviously cannot be done while holding
* scheduler locks.
*/
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
return 0;
}
@@ -875,7 +875,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
- if (!param || pid < 0)
+ if (unlikely(!param || pid < 0))
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
@@ -984,7 +984,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
struct sched_attr attr;
int retval;
- if (!uattr || pid < 0 || flags)
+ if (unlikely(!uattr || pid < 0 || flags))
return -EINVAL;
retval = sched_copy_attr(uattr, &attr);
@@ -1049,7 +1049,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
struct task_struct *p;
int retval;
- if (!param || pid < 0)
+ if (unlikely(!param || pid < 0))
return -EINVAL;
scoped_guard (rcu) {
@@ -1085,8 +1085,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags)
+ if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags))
return -EINVAL;
scoped_guard (rcu) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c49aea8c1025..f1ebc60d967f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -6,13 +6,19 @@
#include <linux/bsearch.h>
DEFINE_MUTEX(sched_domains_mutex);
+void sched_domains_mutex_lock(void)
+{
+ mutex_lock(&sched_domains_mutex);
+}
+void sched_domains_mutex_unlock(void)
+{
+ mutex_unlock(&sched_domains_mutex);
+}
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
-#ifdef CONFIG_SCHED_DEBUG
-
static int __init sched_debug_setup(char *str)
{
sched_debug_verbose = true;
@@ -151,15 +157,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
break;
}
}
-#else /* !CONFIG_SCHED_DEBUG */
-
-# define sched_debug_verbose 0
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
- return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
@@ -560,7 +557,7 @@ static int init_rootdomain(struct root_domain *rd)
rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif
- rd->visit_gen = 0;
+ rd->visit_cookie = 0;
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@@ -2275,9 +2272,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sgc)
return -ENOMEM;
-#ifdef CONFIG_SCHED_DEBUG
sgc->id = j;
-#endif
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
@@ -2680,7 +2675,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
*
* Call with hotplug lock and sched_domains_mutex held
*/
-void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
@@ -2712,21 +2707,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_cur[i], doms_new[j]) &&
- dattrs_equal(dattr_cur, i, dattr_new, j)) {
- struct root_domain *rd;
-
- /*
- * This domain won't be destroyed and as such
- * its dl_bw->total_bw needs to be cleared.
- * Tasks contribution will be then recomputed
- * in function dl_update_tasks_root_domain(),
- * dl_servers contribution in function
- * dl_restore_server_root_domain().
- */
- rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
- dl_clear_root_domain(rd);
+ dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
- }
}
/* No match - a current sched domain not in new doms_new[] */
detach_destroy_domains(doms_cur[i]);
@@ -2783,6 +2765,7 @@ match3:
ndoms_cur = ndoms_new;
update_sched_domain_debugfs();
+ dl_rebuild_rd_accounting();
}
/*
@@ -2791,7 +2774,7 @@ match3:
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7bbb408431eb..41aa761c7738 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -29,13 +29,11 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
+#include <asm/syscall.h>
+
/* Not exposed in headers: strictly internal use only. */
#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1)
-#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
-#include <asm/syscall.h>
-#endif
-
#ifdef CONFIG_SECCOMP_FILTER
#include <linux/file.h>
#include <linux/filter.h>
@@ -576,6 +574,9 @@ void seccomp_filter_release(struct task_struct *tsk)
if (WARN_ON((tsk->flags & PF_EXITING) == 0))
return;
+ if (READ_ONCE(tsk->seccomp.filter) == NULL)
+ return;
+
spin_lock_irq(&tsk->sighand->siglock);
orig = tsk->seccomp.filter;
/* Detach task from its filter tree. */
@@ -601,6 +602,13 @@ static inline void seccomp_sync_threads(unsigned long flags)
BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
assert_spin_locked(&current->sighand->siglock);
+ /*
+ * Don't touch any of the threads if the process is being killed.
+ * This allows for a lockless check in seccomp_filter_release.
+ */
+ if (current->signal->flags & SIGNAL_GROUP_EXIT)
+ return;
+
/* Synchronize all threads. */
caller = current;
for_each_thread(caller, thread) {
@@ -1074,6 +1082,13 @@ void secure_computing_strict(int this_syscall)
else
BUG();
}
+int __secure_computing(void)
+{
+ int this_syscall = syscall_get_nr(current, current_pt_regs());
+
+ secure_computing_strict(this_syscall);
+ return 0;
+}
#else
#ifdef CONFIG_SECCOMP_FILTER
@@ -1225,13 +1240,12 @@ out:
return -1;
}
-static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
- const bool recheck_after_trace)
+static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
{
u32 filter_ret, action;
+ struct seccomp_data sd;
struct seccomp_filter *match = NULL;
int data;
- struct seccomp_data sd_local;
/*
* Make sure that any changes to mode from another thread have
@@ -1239,12 +1253,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
*/
smp_rmb();
- if (!sd) {
- populate_seccomp_data(&sd_local);
- sd = &sd_local;
- }
+ populate_seccomp_data(&sd);
- filter_ret = seccomp_run_filters(sd, &match);
+ filter_ret = seccomp_run_filters(&sd, &match);
data = filter_ret & SECCOMP_RET_DATA;
action = filter_ret & SECCOMP_RET_ACTION_FULL;
@@ -1302,13 +1313,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
* a reload of all registers. This does not goto skip since
* a skip would have already been reported.
*/
- if (__seccomp_filter(this_syscall, NULL, true))
+ if (__seccomp_filter(this_syscall, true))
return -1;
return 0;
case SECCOMP_RET_USER_NOTIF:
- if (seccomp_do_user_notification(this_syscall, match, sd))
+ if (seccomp_do_user_notification(this_syscall, match, &sd))
goto skip;
return 0;
@@ -1350,8 +1361,7 @@ skip:
return -1;
}
#else
-static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
- const bool recheck_after_trace)
+static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
{
BUG();
@@ -1359,7 +1369,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
}
#endif
-int __secure_computing(const struct seccomp_data *sd)
+int __secure_computing(void)
{
int mode = current->seccomp.mode;
int this_syscall;
@@ -1368,15 +1378,14 @@ int __secure_computing(const struct seccomp_data *sd)
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
return 0;
- this_syscall = sd ? sd->nr :
- syscall_get_nr(current, current_pt_regs());
+ this_syscall = syscall_get_nr(current, current_pt_regs());
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
return 0;
case SECCOMP_MODE_FILTER:
- return __seccomp_filter(this_syscall, sd, false);
+ return __seccomp_filter(this_syscall, false);
/* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
case SECCOMP_MODE_DEAD:
WARN_ON_ONCE(1);
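Dropping the seccomp_data argument means callers no longer build and pass a struct seccomp_data; __secure_computing() gathers the syscall information itself via current_pt_regs(). An illustrative sketch of a syscall-entry caller after this change (the function name is hypothetical, not taken from this patch):

static long my_syscall_trace_enter(struct pt_regs *regs, long syscall)
{
	/* seccomp reads the syscall number and arguments on its own now. */
	if (__secure_computing())
		return -1L;	/* syscall was denied or skipped */

	return syscall;
}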
diff --git a/kernel/signal.c b/kernel/signal.c
index 875e97f6205a..86ba66d95da5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2092,7 +2092,7 @@ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueu
* from a non-periodic timer, then just drop the reference
* count. Otherwise queue it on the ignored list.
*/
- if (tmr->it_signal && tmr->it_sig_periodic)
+ if (posixtimer_valid(tmr) && tmr->it_sig_periodic)
hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers);
else
posixtimer_putref(tmr);
@@ -2180,8 +2180,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
/*
- * tsk is a group leader and has no threads, wake up the
- * non-PIDFD_THREAD waiters.
+ * Notify for thread-group leaders without subthreads.
*/
if (thread_group_empty(tsk))
do_notify_pidfd(tsk);
@@ -4009,6 +4008,47 @@ static struct pid *pidfd_to_pid(const struct file *file)
(PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
PIDFD_SIGNAL_PROCESS_GROUP)
+static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type,
+ siginfo_t __user *info, unsigned int flags)
+{
+ kernel_siginfo_t kinfo;
+
+ switch (flags) {
+ case PIDFD_SIGNAL_THREAD:
+ type = PIDTYPE_PID;
+ break;
+ case PIDFD_SIGNAL_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ break;
+ case PIDFD_SIGNAL_PROCESS_GROUP:
+ type = PIDTYPE_PGID;
+ break;
+ }
+
+ if (info) {
+ int ret;
+
+ ret = copy_siginfo_from_user_any(&kinfo, info);
+ if (unlikely(ret))
+ return ret;
+
+ if (unlikely(sig != kinfo.si_signo))
+ return -EINVAL;
+
+ /* Only allow sending arbitrary signals to yourself. */
+ if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
+ (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
+ return -EPERM;
+ } else {
+ prepare_kill_siginfo(sig, &kinfo, type);
+ }
+
+ if (type == PIDTYPE_PGID)
+ return kill_pgrp_info(sig, &kinfo, pid);
+
+ return kill_pid_info_type(sig, &kinfo, pid, type);
+}
+
/**
* sys_pidfd_send_signal - Signal a process through a pidfd
* @pidfd: file descriptor of the process
@@ -4026,9 +4066,7 @@ static struct pid *pidfd_to_pid(const struct file *file)
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
siginfo_t __user *, info, unsigned int, flags)
{
- int ret;
struct pid *pid;
- kernel_siginfo_t kinfo;
enum pid_type type;
/* Enforce flags be set to 0 until we add an extension. */
@@ -4039,57 +4077,39 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
return -EINVAL;
- CLASS(fd, f)(pidfd);
- if (fd_empty(f))
- return -EBADF;
+ switch (pidfd) {
+ case PIDFD_SELF_THREAD:
+ pid = get_task_pid(current, PIDTYPE_PID);
+ type = PIDTYPE_PID;
+ break;
+ case PIDFD_SELF_THREAD_GROUP:
+ pid = get_task_pid(current, PIDTYPE_TGID);
+ type = PIDTYPE_TGID;
+ break;
+ default: {
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
+ return -EBADF;
- /* Is this a pidfd? */
- pid = pidfd_to_pid(fd_file(f));
- if (IS_ERR(pid))
- return PTR_ERR(pid);
+ /* Is this a pidfd? */
+ pid = pidfd_to_pid(fd_file(f));
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
- if (!access_pidfd_pidns(pid))
- return -EINVAL;
+ if (!access_pidfd_pidns(pid))
+ return -EINVAL;
- switch (flags) {
- case 0:
/* Infer scope from the type of pidfd. */
if (fd_file(f)->f_flags & PIDFD_THREAD)
type = PIDTYPE_PID;
else
type = PIDTYPE_TGID;
- break;
- case PIDFD_SIGNAL_THREAD:
- type = PIDTYPE_PID;
- break;
- case PIDFD_SIGNAL_THREAD_GROUP:
- type = PIDTYPE_TGID;
- break;
- case PIDFD_SIGNAL_PROCESS_GROUP:
- type = PIDTYPE_PGID;
- break;
- }
- if (info) {
- ret = copy_siginfo_from_user_any(&kinfo, info);
- if (unlikely(ret))
- return ret;
-
- if (unlikely(sig != kinfo.si_signo))
- return -EINVAL;
-
- /* Only allow sending arbitrary signals to yourself. */
- if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
- (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
- return -EPERM;
- } else {
- prepare_kill_siginfo(sig, &kinfo, type);
+ return do_pidfd_send_signal(pid, sig, type, info, flags);
+ }
}
- if (type == PIDTYPE_PGID)
- return kill_pgrp_info(sig, &kinfo, pid);
- else
- return kill_pid_info_type(sig, &kinfo, pid, type);
+ return do_pidfd_send_signal(pid, sig, type, info, flags);
}
static int
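With the PIDFD_SELF_THREAD and PIDFD_SELF_THREAD_GROUP cases, a task can signal itself without first opening a pidfd. A hedged userspace sketch, assuming the sentinels are exported through linux/pidfd.h and using the raw syscall since glibc provides no wrapper:

#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/pidfd.h>	/* assumed to provide PIDFD_SELF_THREAD_GROUP */

int main(void)
{
	signal(SIGUSR1, SIG_IGN);

	/* Signal the caller's own thread group; no fd is opened or closed. */
	return syscall(SYS_pidfd_send_signal, PIDFD_SELF_THREAD_GROUP,
		       SIGUSR1, NULL, 0);
}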
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4dae6ac2e83f..513b1945987c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -126,6 +126,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
.lock = INIT_LOCAL_LOCK(softirq_ctrl.lock),
};
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key bh_lock_key;
+struct lockdep_map bh_lock_map = {
+ .name = "local_bh",
+ .key = &bh_lock_key,
+ .wait_type_outer = LD_WAIT_FREE,
+ .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */
+ .lock_type = LD_LOCK_PERCPU,
+};
+EXPORT_SYMBOL_GPL(bh_lock_map);
+#endif
+
/**
* local_bh_blocked() - Check for idle whether BH processing is blocked
*
@@ -148,6 +160,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
WARN_ON_ONCE(in_hardirq());
+ lock_map_acquire_read(&bh_lock_map);
+
/* First entry of a task into a BH disabled section? */
if (!current->softirq_disable_cnt) {
if (preemptible()) {
@@ -211,6 +225,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
WARN_ON_ONCE(in_hardirq());
lockdep_assert_irqs_enabled();
+ lock_map_release(&bh_lock_map);
+
local_irq_save(flags);
curcnt = __this_cpu_read(softirq_ctrl.cnt);
@@ -261,6 +277,8 @@ static inline void ksoftirqd_run_begin(void)
/* Counterpart to ksoftirqd_run_begin() */
static inline void ksoftirqd_run_end(void)
{
+ /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */
+ lock_map_release(&bh_lock_map);
__local_bh_enable(SOFTIRQ_OFFSET, true);
WARN_ON_ONCE(in_interrupt());
local_irq_enable();
diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c
index bb7d066a7c39..269683d41aa9 100644
--- a/kernel/static_call_inline.c
+++ b/kernel/static_call_inline.c
@@ -206,7 +206,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
continue;
}
- arch_static_call_transform(site_addr, NULL, func,
+ arch_static_call_transform(site_addr, tramp, func,
static_call_is_tail(site));
}
}
@@ -325,13 +325,12 @@ static int __static_call_mod_text_reserved(void *start, void *end)
struct module *mod;
int ret;
- preempt_disable();
- mod = __module_text_address((unsigned long)start);
- WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
- if (!try_module_get(mod))
- mod = NULL;
- preempt_enable();
-
+ scoped_guard(rcu) {
+ mod = __module_text_address((unsigned long)start);
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+ if (!try_module_get(mod))
+ mod = NULL;
+ }
if (!mod)
return 0;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 8896d844d738..5d2d0562115b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -250,6 +250,7 @@ static int multi_cpu_stop(void *data)
* be detected and reported on their side.
*/
touch_nmi_watchdog();
+ /* Also suppress RCU CPU stall warnings. */
rcu_momentary_eqs();
}
} while (curstate != MULTI_STOP_EXIT);
diff --git a/kernel/sys.c b/kernel/sys.c
index cb366ff8703a..c434968e9f5d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
struct task_struct *p;
struct task_struct *group_leader = current->group_leader;
+ struct pid *pids[PIDTYPE_MAX] = { 0 };
struct pid *pgrp;
int err;
@@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
goto out;
if (task_pgrp(p) != pgrp)
- change_pid(p, PIDTYPE_PGID, pgrp);
+ change_pid(pids, p, PIDTYPE_PGID, pgrp);
err = 0;
out:
/* All paths lead to here, thus we are safe. -DaveM */
write_unlock_irq(&tasklist_lock);
rcu_read_unlock();
+ free_pids(pids);
return err;
}
@@ -1222,21 +1224,22 @@ out:
return retval;
}
-static void set_special_pids(struct pid *pid)
+static void set_special_pids(struct pid **pids, struct pid *pid)
{
struct task_struct *curr = current->group_leader;
if (task_session(curr) != pid)
- change_pid(curr, PIDTYPE_SID, pid);
+ change_pid(pids, curr, PIDTYPE_SID, pid);
if (task_pgrp(curr) != pid)
- change_pid(curr, PIDTYPE_PGID, pid);
+ change_pid(pids, curr, PIDTYPE_PGID, pid);
}
int ksys_setsid(void)
{
struct task_struct *group_leader = current->group_leader;
struct pid *sid = task_pid(group_leader);
+ struct pid *pids[PIDTYPE_MAX] = { 0 };
pid_t session = pid_vnr(sid);
int err = -EPERM;
@@ -1252,13 +1255,14 @@ int ksys_setsid(void)
goto out;
group_leader->signal->leader = 1;
- set_special_pids(sid);
+ set_special_pids(pids, sid);
proc_clear_tty(group_leader);
err = session;
out:
write_unlock_irq(&tasklist_lock);
+ free_pids(pids);
if (err > 0) {
proc_sid_connector(group_leader);
sched_autogroup_create_attach(group_leader);
@@ -2811,6 +2815,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = arch_lock_shadow_stack_status(me, arg2);
break;
+ case PR_TIMER_CREATE_RESTORE_IDS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = posixtimer_create_prctl(arg2);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
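The new PR_TIMER_CREATE_RESTORE_IDS prctl pairs with the timer_create() changes further down in this diff: while the mode is on, the kernel reads the requested timer ID from the created_timer_id pointer instead of allocating the next free one. A hedged userspace sketch of a checkpoint/restore-style caller (helper name and error handling are illustrative):

#include <linux/prctl.h>	/* assumed to carry PR_TIMER_CREATE_RESTORE_IDS* */
#include <signal.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Recreate a timer with a known ID, as checkpoint/restore needs to. */
static int restore_timer_with_id(int wanted_id)
{
	int id = wanted_id;	/* the kernel reads the requested ID from here */
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGALRM,
	};

	if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0))
		return -1;
	if (syscall(SYS_timer_create, CLOCK_MONOTONIC, &sev, &id))
		return -1;
	/* Back to normal ID allocation for timers created afterwards. */
	return prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0);
}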
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cb57da499ebb..3b7a7308e35b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -20,9 +20,6 @@
*/
#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/bitmap.h>
#include <linux/signal.h>
@@ -31,7 +28,6 @@
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/ctype.h>
-#include <linux/kmemleak.h>
#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/init.h>
@@ -42,26 +38,20 @@
#include <linux/highuid.h>
#include <linux/writeback.h>
#include <linux/ratelimit.h>
-#include <linux/hugetlb.h>
#include <linux/initrd.h>
#include <linux/key.h>
#include <linux/times.h>
#include <linux/limits.h>
-#include <linux/dcache.h>
#include <linux/syscalls.h>
-#include <linux/vmstat.h>
#include <linux/nfs_fs.h>
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
-#include <linux/perf_event.h>
-#include <linux/oom.h>
#include <linux/kmod.h>
#include <linux/capability.h>
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
#include <linux/mount.h>
-#include <linux/userfaultfd_k.h>
#include <linux/pid.h>
#include "../lib/kstrtox.h"
@@ -91,12 +81,6 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
#if defined(CONFIG_SYSCTL)
/* Constants used for minimum and maximum */
-
-#ifdef CONFIG_PERF_EVENTS
-static const int six_hundred_forty_kb = 640 * 1024;
-#endif
-
-
static const int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -129,12 +113,6 @@ enum sysctl_writes_mode {
static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
#endif /* CONFIG_PROC_SYSCTL */
-
-#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
- defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
-int sysctl_legacy_va_layout;
-#endif
-
#endif /* CONFIG_SYSCTL */
/*
@@ -1794,15 +1772,6 @@ static const struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_MAXOLDUID,
},
-#ifdef CONFIG_S390
- {
- .procname = "userprocess_debug",
- .data = &show_unhandled_signals,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
{
.procname = "panic_on_oops",
.data = &panic_on_oops,
@@ -1831,16 +1800,6 @@ static const struct ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = proc_dointvec,
},
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
- {
- .procname = "unknown_nmi_panic",
- .data = &unknown_nmi_panic,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-
#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
defined(CONFIG_DEBUG_STACKOVERFLOW)
{
@@ -1851,43 +1810,6 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#if defined(CONFIG_X86)
- {
- .procname = "panic_on_unrecovered_nmi",
- .data = &panic_on_unrecovered_nmi,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "panic_on_io_nmi",
- .data = &panic_on_io_nmi,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "bootloader_type",
- .data = &bootloader_type,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "bootloader_version",
- .data = &bootloader_version,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "io_delay_type",
- .data = &io_delay_type,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#if defined(CONFIG_MMU)
{
.procname = "randomize_va_space",
@@ -1897,24 +1819,6 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#if defined(CONFIG_S390) && defined(CONFIG_SMP)
- {
- .procname = "spin_retry",
- .data = &spin_retry,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
- {
- .procname = "acpi_video_flags",
- .data = &acpi_realmode_flags,
- .maxlen = sizeof (unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
-#endif
#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
{
.procname = "ignore-unaligned-usertrap",
@@ -1933,63 +1837,6 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_PERF_EVENTS
- /*
- * User-space scripts rely on the existence of this file
- * as a feature check for perf_events being enabled.
- *
- * So it's an ABI, do not remove!
- */
- {
- .procname = "perf_event_paranoid",
- .data = &sysctl_perf_event_paranoid,
- .maxlen = sizeof(sysctl_perf_event_paranoid),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "perf_event_mlock_kb",
- .data = &sysctl_perf_event_mlock,
- .maxlen = sizeof(sysctl_perf_event_mlock),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "perf_event_max_sample_rate",
- .data = &sysctl_perf_event_sample_rate,
- .maxlen = sizeof(sysctl_perf_event_sample_rate),
- .mode = 0644,
- .proc_handler = perf_event_max_sample_rate_handler,
- .extra1 = SYSCTL_ONE,
- },
- {
- .procname = "perf_cpu_time_max_percent",
- .data = &sysctl_perf_cpu_time_max_percent,
- .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
- .mode = 0644,
- .proc_handler = perf_cpu_time_max_percent_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- {
- .procname = "perf_event_max_stack",
- .data = &sysctl_perf_event_max_stack,
- .maxlen = sizeof(sysctl_perf_event_max_stack),
- .mode = 0644,
- .proc_handler = perf_event_max_stack_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = (void *)&six_hundred_forty_kb,
- },
- {
- .procname = "perf_event_max_contexts_per_stack",
- .data = &sysctl_perf_event_max_contexts_per_stack,
- .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack),
- .mode = 0644,
- .proc_handler = perf_event_max_stack_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_THOUSAND,
- },
-#endif
{
.procname = "panic_on_warn",
.data = &panic_on_warn,
@@ -2021,215 +1868,9 @@ static const struct ctl_table kern_table[] = {
#endif
};
-static const struct ctl_table vm_table[] = {
- {
- .procname = "overcommit_memory",
- .data = &sysctl_overcommit_memory,
- .maxlen = sizeof(sysctl_overcommit_memory),
- .mode = 0644,
- .proc_handler = overcommit_policy_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_TWO,
- },
- {
- .procname = "overcommit_ratio",
- .data = &sysctl_overcommit_ratio,
- .maxlen = sizeof(sysctl_overcommit_ratio),
- .mode = 0644,
- .proc_handler = overcommit_ratio_handler,
- },
- {
- .procname = "overcommit_kbytes",
- .data = &sysctl_overcommit_kbytes,
- .maxlen = sizeof(sysctl_overcommit_kbytes),
- .mode = 0644,
- .proc_handler = overcommit_kbytes_handler,
- },
- {
- .procname = "page-cluster",
- .data = &page_cluster,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = (void *)&page_cluster_max,
- },
- {
- .procname = "dirtytime_expire_seconds",
- .data = &dirtytime_expire_interval,
- .maxlen = sizeof(dirtytime_expire_interval),
- .mode = 0644,
- .proc_handler = dirtytime_interval_handler,
- .extra1 = SYSCTL_ZERO,
- },
- {
- .procname = "swappiness",
- .data = &vm_swappiness,
- .maxlen = sizeof(vm_swappiness),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_TWO_HUNDRED,
- },
-#ifdef CONFIG_NUMA
- {
- .procname = "numa_stat",
- .data = &sysctl_vm_numa_stat,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sysctl_vm_numa_stat_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
- {
- .procname = "drop_caches",
- .data = &sysctl_drop_caches,
- .maxlen = sizeof(int),
- .mode = 0200,
- .proc_handler = drop_caches_sysctl_handler,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_FOUR,
- },
- {
- .procname = "page_lock_unfairness",
- .data = &sysctl_page_lock_unfairness,
- .maxlen = sizeof(sysctl_page_lock_unfairness),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
-#ifdef CONFIG_MMU
- {
- .procname = "max_map_count",
- .data = &sysctl_max_map_count,
- .maxlen = sizeof(sysctl_max_map_count),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
-#else
- {
- .procname = "nr_trim_pages",
- .data = &sysctl_nr_trim_pages,
- .maxlen = sizeof(sysctl_nr_trim_pages),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
-#endif
- {
- .procname = "vfs_cache_pressure",
- .data = &sysctl_vfs_cache_pressure,
- .maxlen = sizeof(sysctl_vfs_cache_pressure),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
-#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
- defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
- {
- .procname = "legacy_va_layout",
- .data = &sysctl_legacy_va_layout,
- .maxlen = sizeof(sysctl_legacy_va_layout),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
-#endif
-#ifdef CONFIG_NUMA
- {
- .procname = "zone_reclaim_mode",
- .data = &node_reclaim_mode,
- .maxlen = sizeof(node_reclaim_mode),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- },
-#endif
-#ifdef CONFIG_SMP
- {
- .procname = "stat_interval",
- .data = &sysctl_stat_interval,
- .maxlen = sizeof(sysctl_stat_interval),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "stat_refresh",
- .data = NULL,
- .maxlen = 0,
- .mode = 0600,
- .proc_handler = vmstat_refresh,
- },
-#endif
-#ifdef CONFIG_MMU
- {
- .procname = "mmap_min_addr",
- .data = &dac_mmap_min_addr,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = mmap_min_addr_handler,
- },
-#endif
-#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \
- (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
- {
- .procname = "vdso_enabled",
-#ifdef CONFIG_X86_32
- .data = &vdso32_enabled,
- .maxlen = sizeof(vdso32_enabled),
-#else
- .data = &vdso_enabled,
- .maxlen = sizeof(vdso_enabled),
-#endif
- .mode = 0644,
- .proc_handler = proc_dointvec,
- .extra1 = SYSCTL_ZERO,
- },
-#endif
- {
- .procname = "user_reserve_kbytes",
- .data = &sysctl_user_reserve_kbytes,
- .maxlen = sizeof(sysctl_user_reserve_kbytes),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
- .procname = "admin_reserve_kbytes",
- .data = &sysctl_admin_reserve_kbytes,
- .maxlen = sizeof(sysctl_admin_reserve_kbytes),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
-#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
- {
- .procname = "mmap_rnd_bits",
- .data = &mmap_rnd_bits,
- .maxlen = sizeof(mmap_rnd_bits),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)&mmap_rnd_bits_min,
- .extra2 = (void *)&mmap_rnd_bits_max,
- },
-#endif
-#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
- {
- .procname = "mmap_rnd_compat_bits",
- .data = &mmap_rnd_compat_bits,
- .maxlen = sizeof(mmap_rnd_compat_bits),
- .mode = 0600,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)&mmap_rnd_compat_bits_min,
- .extra2 = (void *)&mmap_rnd_compat_bits_max,
- },
-#endif
-};
-
int __init sysctl_init_bases(void)
{
register_sysctl_init("kernel", kern_table);
- register_sysctl_init("vm", vm_table);
return 0;
}
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index fe0ae82124fe..e6e9b85d4db5 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
+
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING
+endif
+
obj-y += time.o timer.o hrtimer.o sleep_timeout.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timeconv.o timecounter.o alarmtimer.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2a7802ec480c..e0eeacbe2521 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1510,7 +1510,7 @@ static int __init boot_override_clocksource(char* str)
{
mutex_lock(&clocksource_mutex);
if (str)
- strscpy(override_name, str, sizeof(override_name));
+ strscpy(override_name, str);
mutex_unlock(&clocksource_mutex);
return 1;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index deb1aa32814e..22376a1a75b9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.csd = CSD_INIT(retrigger_next_event, NULL)
};
-static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
- /* Make sure we catch unsupported clockids */
- [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
-
- [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
- [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
- [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
- [CLOCK_TAI] = HRTIMER_BASE_TAI,
-};
-
static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
{
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
@@ -1587,19 +1577,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
- if (likely(clock_id < MAX_CLOCKS)) {
- int base = hrtimer_clock_to_base_table[clock_id];
-
- if (likely(base != HRTIMER_MAX_CLOCK_BASES))
- return base;
+ switch (clock_id) {
+ case CLOCK_REALTIME:
+ return HRTIMER_BASE_REALTIME;
+ case CLOCK_MONOTONIC:
+ return HRTIMER_BASE_MONOTONIC;
+ case CLOCK_BOOTTIME:
+ return HRTIMER_BASE_BOOTTIME;
+ case CLOCK_TAI:
+ return HRTIMER_BASE_TAI;
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return HRTIMER_BASE_MONOTONIC;
}
- WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
- return HRTIMER_BASE_MONOTONIC;
-}
-
-static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused)
-{
- return HRTIMER_NORESTART;
}
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 0775b9ec952a..e3642278df43 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -165,26 +165,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off)
* HVCLOCK
* VVAR
*
- * The check for vdso_data->clock_mode is in the unlikely path of
+ * The check for vdso_clock->clock_mode is in the unlikely path of
* the seq begin magic. So for the non-timens case most of the time
* 'seq' is even, so the branch is not taken.
*
* If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
- * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
+ * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
* update to finish and for 'seq' to become even anyway.
*
- * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which
+ * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
* enforces the time namespace handling path.
*/
-static void timens_setup_vdso_data(struct vdso_data *vdata,
- struct time_namespace *ns)
+static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
+ struct time_namespace *ns)
{
- struct timens_offset *offset = vdata->offset;
+ struct timens_offset *offset = vc->offset;
struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
- vdata->seq = 1;
- vdata->clock_mode = VDSO_CLOCKMODE_TIMENS;
+ vc->seq = 1;
+ vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
offset[CLOCK_MONOTONIC] = monotonic;
offset[CLOCK_MONOTONIC_RAW] = monotonic;
offset[CLOCK_MONOTONIC_COARSE] = monotonic;
@@ -219,7 +219,8 @@ static DEFINE_MUTEX(offset_lock);
static void timens_set_vvar_page(struct task_struct *task,
struct time_namespace *ns)
{
- struct vdso_data *vdata;
+ struct vdso_time_data *vdata;
+ struct vdso_clock *vc;
unsigned int i;
if (ns == &init_time_ns)
@@ -235,10 +236,11 @@ static void timens_set_vvar_page(struct task_struct *task,
goto out;
ns->frozen_offsets = true;
- vdata = arch_get_vdso_data(page_address(ns->vvar_page));
+ vdata = page_address(ns->vvar_page);
+ vc = vdata->clock_data;
for (i = 0; i < CS_BASES; i++)
- timens_setup_vdso_data(&vdata[i], ns);
+ timens_setup_vdso_clock_data(&vc[i], ns);
out:
mutex_unlock(&offset_lock);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 163e7a2033b6..b837d3d9d325 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -678,8 +678,7 @@ void ntp_notify_cmos_timer(bool offset_set)
static void __init ntp_init_cmos_sync(void)
{
- hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- sync_hrtimer.function = sync_timer_callback;
+ hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS);
}
#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */
static inline void __init ntp_init_cmos_sync(void) { }
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 1af0bb2cc45c..101a0f7c43e0 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp,
return err;
}
-#ifdef CONFIG_COMPAT
-static long posix_clock_compat_ioctl(struct file *fp,
- unsigned int cmd, unsigned long arg)
-{
- struct posix_clock_context *pccontext = fp->private_data;
- struct posix_clock *clk = get_posix_clock(fp);
- int err = -ENOTTY;
-
- if (!clk)
- return -ENODEV;
-
- if (clk->ops.ioctl)
- err = clk->ops.ioctl(pccontext, cmd, arg);
-
- put_posix_clock(clk);
-
- return err;
-}
-#endif
-
static int posix_clock_open(struct inode *inode, struct file *fp)
{
int err;
@@ -129,6 +109,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
goto out;
}
pccontext->clk = clk;
+ pccontext->fp = fp;
if (clk->ops.open) {
err = clk->ops.open(pccontext, fp->f_mode);
if (err) {
@@ -171,11 +152,9 @@ static const struct file_operations posix_clock_file_operations = {
.read = posix_clock_read,
.poll = posix_clock_poll,
.unlocked_ioctl = posix_clock_ioctl,
+ .compat_ioctl = posix_clock_ioctl,
.open = posix_clock_open,
.release = posix_clock_release,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = posix_clock_compat_ioctl,
-#endif
};
int posix_clock_register(struct posix_clock *clk, struct device *dev)
@@ -251,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx)
if (err)
return err;
- if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) {
err = -EACCES;
goto out;
}
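Since pc_clock_adjtime() now only requires a writable descriptor when tx->modes requests a change, a pure state query works on a clock device opened read-only. A hedged userspace sketch (the device path is an example; FD_TO_CLOCKID() is open-coded):

#include <fcntl.h>
#include <stdio.h>
#include <sys/timex.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/ptp0", O_RDONLY);	/* read-only is now sufficient */
	clockid_t clk = (~(clockid_t)fd << 3) | 3;	/* open-coded FD_TO_CLOCKID(fd) */
	struct timex tx = { .modes = 0 };	/* query only, nothing is modified */

	if (fd < 0 || clock_adjtime(clk, &tx) < 0) {
		perror("clock_adjtime");
		return 1;
	}
	puts("read-only adjtime query succeeded");
	close(fd);
	return 0;
}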
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 1b675aee99a9..6222112533a7 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -9,28 +9,23 @@
*
* These are all the functions necessary to implement POSIX clocks & timers
*/
-#include <linux/mm.h>
+#include <linux/compat.h>
+#include <linux/compiler.h>
+#include <linux/init.h>
+#include <linux/jhash.h>
#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/mutex.h>
-#include <linux/sched/task.h>
-
-#include <linux/uaccess.h>
#include <linux/list.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/hash.h>
+#include <linux/memblock.h>
+#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
+#include <linux/prctl.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
#include <linux/syscalls.h>
-#include <linux/wait.h>
-#include <linux/workqueue.h>
-#include <linux/export.h>
-#include <linux/hashtable.h>
-#include <linux/compat.h>
-#include <linux/nospec.h>
+#include <linux/time.h>
#include <linux/time_namespace.h>
+#include <linux/uaccess.h>
#include "timekeeping.h"
#include "posix-timers.h"
@@ -46,39 +41,65 @@ static struct kmem_cache *posix_timers_cache;
* This allows checkpoint/restore to reconstruct the exact timer IDs for
* a process.
*/
-static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
-static DEFINE_SPINLOCK(hash_lock);
+struct timer_hash_bucket {
+ spinlock_t lock;
+ struct hlist_head head;
+};
+
+static struct {
+ struct timer_hash_bucket *buckets;
+ unsigned long mask;
+} __timer_data __ro_after_init __aligned(2*sizeof(long));
+
+#define timer_buckets (__timer_data.buckets)
+#define timer_hashmask (__timer_data.mask)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
+#define TIMER_ANY_ID INT_MIN
+
/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
-static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
+static struct k_itimer *__lock_timer(timer_t timer_id);
-#define lock_timer(tid, flags) \
-({ struct k_itimer *__timr; \
- __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
- __timr; \
+#define lock_timer(tid) \
+({ struct k_itimer *__timr; \
+ __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \
+ __timr; \
})
-static int hash(struct signal_struct *sig, unsigned int nr)
+static inline void unlock_timer(struct k_itimer *timr)
{
- return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
+ if (likely((timr)))
+ spin_unlock_irq(&timr->it_lock);
}
-static struct k_itimer *__posix_timers_find(struct hlist_head *head,
- struct signal_struct *sig,
- timer_t id)
+#define scoped_timer_get_or_fail(_id) \
+ scoped_cond_guard(lock_timer, return -EINVAL, _id)
+
+#define scoped_timer (scope)
+
+DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id);
+DEFINE_CLASS_IS_COND_GUARD(lock_timer);
+
+static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr)
{
+ return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask];
+}
+
+static struct k_itimer *posix_timer_by_id(timer_t id)
+{
+ struct signal_struct *sig = current->signal;
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) {
+ hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) {
/* timer->it_signal can be set concurrently */
if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id))
return timer;
@@ -86,46 +107,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head,
return NULL;
}
-static struct k_itimer *posix_timer_by_id(timer_t id)
+static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer)
{
- struct signal_struct *sig = current->signal;
- struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+ unsigned long val = (unsigned long)timer->it_signal;
- return __posix_timers_find(head, sig, id);
+ /*
+ * Mask out bit 0, which acts as invalid marker to prevent
+ * posix_timer_by_id() detecting it as valid.
+ */
+ return (struct signal_struct *)(val & ~1UL);
}
-static int posix_timer_add(struct k_itimer *timer)
+static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig,
+ timer_t id)
{
- struct signal_struct *sig = current->signal;
- struct hlist_head *head;
- unsigned int cnt, id;
+ struct hlist_head *head = &bucket->head;
+ struct k_itimer *timer;
- /*
- * FIXME: Replace this by a per signal struct xarray once there is
- * a plan to handle the resulting CRIU regression gracefully.
- */
- for (cnt = 0; cnt <= INT_MAX; cnt++) {
- spin_lock(&hash_lock);
- id = sig->next_posix_timer_id;
+ hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) {
+ if ((posix_sig_owner(timer) == sig) && (timer->it_id == id))
+ return true;
+ }
+ return false;
+}
- /* Write the next ID back. Clamp it to the positive space */
- sig->next_posix_timer_id = (id + 1) & INT_MAX;
+static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
+{
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
- head = &posix_timers_hashtable[hash(sig, id)];
- if (!__posix_timers_find(head, sig, id)) {
- hlist_add_head_rcu(&timer->t_hash, head);
- spin_unlock(&hash_lock);
- return id;
+ scoped_guard (spinlock, &bucket->lock) {
+ /*
+ * Validate under the lock as this could have raced against
+ * another thread ending up with the same ID, which is
+ * highly unlikely, but possible.
+ */
+ if (!posix_timer_hashed(bucket, sig, id)) {
+ /*
+ * Set the timer ID and the signal pointer to make
+ * it identifiable in the hash table. The signal
+ * pointer has bit 0 set to indicate that it is not
+ * yet fully initialized. posix_timer_hashed()
+ * masks this bit out, but the syscall lookup fails
+ * to match due to it being set. This guarantees
+ * that there can't be duplicate timer IDs handed
+ * out.
+ */
+ timer->it_id = (timer_t)id;
+ timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
+ hlist_add_head_rcu(&timer->t_hash, &bucket->head);
+ return true;
}
- spin_unlock(&hash_lock);
}
- /* POSIX return code when no timer ID could be allocated */
- return -EAGAIN;
+ return false;
}
-static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
+static int posix_timer_add(struct k_itimer *timer, int req_id)
{
- spin_unlock_irqrestore(&timr->it_lock, flags);
+ struct signal_struct *sig = current->signal;
+
+ if (unlikely(req_id != TIMER_ANY_ID)) {
+ if (!posix_timer_add_at(timer, sig, req_id))
+ return -EBUSY;
+
+ /*
+ * Move the ID counter past the requested ID, so that after
+ * switching back to normal mode the IDs are outside of the
+ * exact allocated region. That avoids ID collisions on the
+ * next regular timer_create() invocations.
+ */
+ atomic_set(&sig->next_posix_timer_id, req_id + 1);
+ return req_id;
+ }
+
+ for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
+ /* Get the next timer ID and clamp it to positive space */
+ unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
+
+ if (posix_timer_add_at(timer, sig, id))
+ return id;
+ cond_resched();
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
@@ -222,9 +285,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
static __init int init_posix_timers(void)
{
- posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof(struct k_itimer), 0,
- SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
+ __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL);
return 0;
}
__initcall(init_posix_timers);
@@ -259,7 +321,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it
* since the signal was queued. In either case, don't rearm and
* drop the signal.
*/
- if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal))
+ if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr)))
return false;
if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
@@ -304,6 +366,9 @@ void posix_timer_queue_signal(struct k_itimer *timr)
{
lockdep_assert_held(&timr->it_lock);
+ if (!posixtimer_valid(timr))
+ return;
+
timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED;
posixtimer_send_sigqueue(timr);
}
@@ -324,6 +389,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+long posixtimer_create_prctl(unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_TIMER_CREATE_RESTORE_IDS_OFF:
+ current->signal->timer_create_restore_ids = 0;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_ON:
+ current->signal->timer_create_restore_ids = 1;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_GET:
+ return current->signal->timer_create_restore_ids;
+ }
+ return -EINVAL;
+}
+
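
posixtimer_create_prctl() is the kernel side of the new restore mode used by CRIU: while the mode is enabled, timer_create() copies the wanted ID from the user-supplied created_timer_id location instead of allocating the next free one. A hedged user-space sketch of that flow, assuming the matching uapi headers provide the PR_TIMER_CREATE_RESTORE_IDS option and the _ON/_OFF values seen above:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/prctl.h>
    #include <time.h>

    int main(void)
    {
        /* The ID this timer had at checkpoint time. */
        timer_t id = (timer_t)42;
        struct sigevent sev = {
            .sigev_notify = SIGEV_SIGNAL,
            .sigev_signo  = SIGALRM,
        };

        /* Switch the process into restore mode (option name assumed). */
        if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0))
            perror("prctl");

        /* In restore mode the kernel reads the requested ID from &id. */
        if (timer_create(CLOCK_MONOTONIC, &sev, &id))
            perror("timer_create");
        else
            printf("restored timer id %ld\n", (long)id);

        /* Back to normal allocation for any further timer_create() calls. */
        prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0);
        return 0;
    }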
static struct pid *good_sigevent(sigevent_t * event)
{
struct pid *pid = task_tgid(current);
@@ -350,8 +430,12 @@ static struct pid *good_sigevent(sigevent_t * event)
static struct k_itimer *alloc_posix_timer(void)
{
- struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+ struct k_itimer *tmr;
+ if (unlikely(!posix_timers_cache))
+ return NULL;
+
+ tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
if (!tmr)
return tmr;
@@ -373,15 +457,16 @@ void posixtimer_free_timer(struct k_itimer *tmr)
static void posix_timer_unhash_and_free(struct k_itimer *tmr)
{
- spin_lock(&hash_lock);
- hlist_del_rcu(&tmr->t_hash);
- spin_unlock(&hash_lock);
+ struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id);
+
+ scoped_guard (spinlock, &bucket->lock)
+ hlist_del_rcu(&tmr->t_hash);
posixtimer_putref(tmr);
}
static int common_timer_create(struct k_itimer *new_timer)
{
- hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
+ hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0);
return 0;
}
@@ -390,6 +475,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
timer_t __user *created_timer_id)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
+ timer_t req_id = TIMER_ANY_ID;
struct k_itimer *new_timer;
int error, new_timer_id;
@@ -404,26 +490,32 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
spin_lock_init(&new_timer->it_lock);
+ /* Special case for CRIU to restore timers with a given timer ID. */
+ if (unlikely(current->signal->timer_create_restore_ids)) {
+ if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
+ return -EFAULT;
+ /* Valid IDs are 0..INT_MAX */
+ if ((unsigned int)req_id > INT_MAX)
+ return -EINVAL;
+ }
+
/*
* Add the timer to the hash table. The timer is not yet valid
- * because new_timer::it_signal is still NULL. The timer id is also
- * not yet visible to user space.
+ * after insertion, but has a unique ID allocated.
*/
- new_timer_id = posix_timer_add(new_timer);
+ new_timer_id = posix_timer_add(new_timer, req_id);
if (new_timer_id < 0) {
posixtimer_free_timer(new_timer);
return new_timer_id;
}
- new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->kclock = kc;
new_timer->it_overrun = -1LL;
if (event) {
- rcu_read_lock();
- new_timer->it_pid = get_pid(good_sigevent(event));
- rcu_read_unlock();
+ scoped_guard (rcu)
+ new_timer->it_pid = get_pid(good_sigevent(event));
if (!new_timer->it_pid) {
error = -EINVAL;
goto out;
@@ -434,7 +526,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
} else {
new_timer->it_sigev_notify = SIGEV_SIGNAL;
new_timer->sigq.info.si_signo = SIGALRM;
- memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t));
new_timer->sigq.info.si_value.sival_int = new_timer->it_id;
new_timer->it_pid = get_pid(task_tgid(current));
}
@@ -453,7 +544,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
}
/*
 * After successful copy out, the timer ID is visible to user space
- * now but not yet valid because new_timer::signal is still NULL.
+ * now but not yet valid because new_timer::signal low order bit is 1.
*
* Complete the initialization with the clock specific create
* callback.
@@ -462,14 +553,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
if (error)
goto out;
- spin_lock_irq(&current->sighand->siglock);
- /* This makes the timer valid in the hash table */
- WRITE_ONCE(new_timer->it_signal, current->signal);
- hlist_add_head(&new_timer->list, &current->signal->posix_timers);
- spin_unlock_irq(&current->sighand->siglock);
/*
- * After unlocking sighand::siglock @new_timer is subject to
- * concurrent removal and cannot be touched anymore
+ * timer::it_lock ensures that __lock_timer() observes a fully
+ * initialized timer when it observes a valid timer::it_signal.
+ *
+ * sighand::siglock is required to protect signal::posix_timers.
+ */
+ scoped_guard (spinlock_irq, &new_timer->it_lock) {
+ guard(spinlock)(&current->sighand->siglock);
+ /*
+ * new_timer::it_signal contains the signal pointer with
+ * bit 0 set, which makes it invalid for syscall operations.
+ * Store the unmodified signal pointer to make it valid.
+ */
+ WRITE_ONCE(new_timer->it_signal, current->signal);
+ hlist_add_head_rcu(&new_timer->list, &current->signal->posix_timers);
+ }
+ /*
+ * After unlocking, @new_timer is subject to concurrent removal and
+ * cannot be touched anymore.
*/
return 0;
out:
@@ -507,7 +609,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
}
#endif
-static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *__lock_timer(timer_t timer_id)
{
struct k_itimer *timr;
@@ -522,11 +624,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
* The hash lookup and the timers are RCU protected.
*
* Timers are added to the hash in invalid state where
- * timr::it_signal == NULL. timer::it_signal is only set after the
- * rest of the initialization succeeded.
+ * timr::it_signal is marked invalid. timer::it_signal is only set
+ * after the rest of the initialization succeeded.
*
* Timer destruction happens in steps:
- * 1) Set timr::it_signal to NULL with timr::it_lock held
+ * 1) Set timr::it_signal marked invalid with timr::it_lock held
* 2) Release timr::it_lock
* 3) Remove from the hash under hash_lock
* 4) Put the reference count.
@@ -543,25 +645,21 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
*
* The lookup validates locklessly that timr::it_signal ==
 * current::signal and timr::it_id == @timer_id. timr::it_id
- * can't change, but timr::it_signal becomes NULL during
- * destruction.
+ * can't change, but timr::it_signal can become invalid during
+ * destruction, which makes the locked check fail.
*/
- rcu_read_lock();
+ guard(rcu)();
timr = posix_timer_by_id(timer_id);
if (timr) {
- spin_lock_irqsave(&timr->it_lock, *flags);
+ spin_lock_irq(&timr->it_lock);
/*
* Validate under timr::it_lock that timr::it_signal is
* still valid. Pairs with #1 above.
*/
- if (timr->it_signal == current->signal) {
- rcu_read_unlock();
+ if (timr->it_signal == current->signal)
return timr;
- }
- spin_unlock_irqrestore(&timr->it_lock, *flags);
+ spin_unlock_irq(&timr->it_lock);
}
- rcu_read_unlock();
-
return NULL;
}
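
The guard(rcu)() and scoped_guard() conversions above lean on the compiler's cleanup attribute: the release runs automatically when the guard variable leaves scope, which is what makes the early "return timr" safe without an explicit rcu_read_unlock(). A stand-alone illustration of that mechanism with a pthread mutex (a GCC/Clang extension, not the kernel's cleanup.h macros):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

    /* Called automatically when the guarded variable goes out of scope. */
    static void guard_release(pthread_mutex_t **lock)
    {
        pthread_mutex_unlock(*lock);
        printf("unlocked\n");
    }

    /* Take the lock now, drop it at the end of the enclosing scope. */
    #define GUARD(lock) \
        pthread_mutex_t *guard_var \
            __attribute__((cleanup(guard_release), unused)) = \
            (pthread_mutex_lock(lock), (lock))

    static int lookup(int key)
    {
        GUARD(&m);
        if (key == 42)
            return 1;   /* early return still unlocks via the cleanup hook */
        return 0;
    }

    int main(void)
    {
        printf("found=%d\n", lookup(42));
        printf("found=%d\n", lookup(7));
        return 0;
    }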
@@ -652,24 +750,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
- const struct k_clock *kc;
- struct k_itimer *timr;
- unsigned long flags;
- int ret = 0;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
-
memset(setting, 0, sizeof(*setting));
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_get))
- ret = -EINVAL;
- else
- kc->timer_get(timr, setting);
-
- unlock_timer(timr, flags);
- return ret;
+ scoped_timer_get_or_fail(timer_id)
+ scoped_timer->kclock->timer_get(scoped_timer, setting);
+ return 0;
}
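
scoped_timer_get_or_fail() is introduced elsewhere in this series; its shape is a for(;;)-based macro that looks up and locks the timer, exposes it to the body as scoped_timer, unlocks on the way out, and makes the enclosing function fail when the lookup does. A rough user-space sketch of that for-loop scoping pattern (made-up names; the real macro additionally uses cleanup guards so that early returns from the body also unlock):

    #include <stdio.h>

    struct res { int id; int locked; };

    static struct res table[2] = { { .id = 1 }, { .id = 2 } };

    static struct res *get_locked(int id)
    {
        for (int i = 0; i < 2; i++) {
            if (table[i].id == id) {
                table[i].locked = 1;
                return &table[i];
            }
        }
        return NULL;
    }

    static void put_unlocked(struct res *r)
    {
        r->locked = 0;
    }

    /*
     * Run the body once with scoped_res looked up and locked, or return
     * -1 from the calling function when the lookup fails.
     */
    #define scoped_res_get_or_fail(id)                                  \
        for (struct res *scoped_res = get_locked(id),                   \
                        *once = (struct res *)1;                        \
             once; put_unlocked(scoped_res), once = NULL)               \
            if (!scoped_res)                                            \
                return -1;                                              \
            else

    static int query(int id, int *out)
    {
        scoped_res_get_or_fail(id)
            *out = scoped_res->id * 100;
        return 0;
    }

    int main(void)
    {
        int v = 0, ret;

        ret = query(2, &v);
        printf("query(2) -> %d, v=%d\n", ret, v);
        ret = query(9, &v);
        printf("query(9) -> %d\n", ret);
        return 0;
    }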
/* Get the time remaining on a POSIX.1b interval timer. */
@@ -723,18 +807,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
*/
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
- struct k_itimer *timr;
- unsigned long flags;
- int overrun;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
-
- overrun = timer_overrun_to_int(timr);
- unlock_timer(timr, flags);
-
- return overrun;
+ scoped_timer_get_or_fail(timer_id)
+ return timer_overrun_to_int(scoped_timer);
}
static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
@@ -747,7 +821,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
/*
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
- * hood. See hrtimer_init(). Update timr->kclock, so the generic
+ * hood. See hrtimer_setup(). Update timr->kclock, so the generic
* functions which use timr->kclock->clock_get_*() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
@@ -756,8 +830,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
if (timr->it_clock == CLOCK_REALTIME)
timr->kclock = absolute ? &clock_realtime : &clock_monotonic;
- hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
- timr->it.real.timer.function = posix_timer_fn;
+ hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode);
if (!absolute)
expires = ktime_add_safe(expires, timer->base->get_time());
@@ -791,26 +864,13 @@ static void common_timer_wait_running(struct k_itimer *timer)
* when the task which tries to delete or disarm the timer has preempted
* the task which runs the expiry in task work context.
*/
-static struct k_itimer *timer_wait_running(struct k_itimer *timer,
- unsigned long *flags)
+static void timer_wait_running(struct k_itimer *timer)
{
- const struct k_clock *kc = READ_ONCE(timer->kclock);
- timer_t timer_id = READ_ONCE(timer->it_id);
-
- /* Prevent kfree(timer) after dropping the lock */
- rcu_read_lock();
- unlock_timer(timer, *flags);
-
/*
* kc->timer_wait_running() might drop RCU lock. So @timer
* cannot be touched anymore after the function returns!
*/
- if (!WARN_ON_ONCE(!kc->timer_wait_running))
- kc->timer_wait_running(timer);
-
- rcu_read_unlock();
- /* Relock the timer. It might be not longer hashed. */
- return lock_timer(timer_id, flags);
+ timer->kclock->timer_wait_running(timer);
}
/*
@@ -865,15 +925,9 @@ int common_timer_set(struct k_itimer *timr, int flags,
return 0;
}
-static int do_timer_settime(timer_t timer_id, int tmr_flags,
- struct itimerspec64 *new_spec64,
+static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64,
struct itimerspec64 *old_spec64)
{
- const struct k_clock *kc;
- struct k_itimer *timr;
- unsigned long flags;
- int error;
-
if (!timespec64_valid(&new_spec64->it_interval) ||
!timespec64_valid(&new_spec64->it_value))
return -EINVAL;
@@ -881,33 +935,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags,
if (old_spec64)
memset(old_spec64, 0, sizeof(*old_spec64));
- timr = lock_timer(timer_id, &flags);
-retry:
- if (!timr)
- return -EINVAL;
+ for (; ; old_spec64 = NULL) {
+ struct k_itimer *timr;
- if (old_spec64)
- old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
+ scoped_timer_get_or_fail(timer_id) {
+ timr = scoped_timer;
- /* Prevent signal delivery and rearming. */
- timr->it_signal_seq++;
+ if (old_spec64)
+ old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_set))
- error = -EINVAL;
- else
- error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64);
-
- if (error == TIMER_RETRY) {
- // We already got the old time...
- old_spec64 = NULL;
- /* Unlocks and relocks the timer if it still exists */
- timr = timer_wait_running(timr, &flags);
- goto retry;
- }
- unlock_timer(timr, flags);
+ /* Prevent signal delivery and rearming. */
+ timr->it_signal_seq++;
- return error;
+ int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64);
+ if (ret != TIMER_RETRY)
+ return ret;
+
+ /* Protect the timer from being freed when leaving the lock scope */
+ rcu_read_lock();
+ }
+ timer_wait_running(timr);
+ rcu_read_unlock();
+ }
}
/* Set a POSIX.1b interval timer */
@@ -978,110 +1027,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr)
}
}
-static inline int timer_delete_hook(struct k_itimer *timer)
+static void posix_timer_delete(struct k_itimer *timer)
{
- const struct k_clock *kc = timer->kclock;
-
- /* Prevent signal delivery and rearming. */
+ /*
+ * Invalidate the timer, remove it from the linked list and remove
+ * it from the ignored list if pending.
+ *
+ * The invalidation must be written with siglock held so that the
+ * signal code observes the invalidated timer::it_signal in
+ * do_sigaction(), which prevents it from moving a pending signal
+ * of a deleted timer to the ignore list.
+ *
+ * The invalidation also prevents signal queueing, signal delivery
+ * and therefore rearming from the signal delivery path.
+ *
+ * A concurrent lookup can still find the timer in the hash, but it
+ * will check timer::it_signal with timer::it_lock held and observe
+ * bit 0 set, which invalidates it. That also prevents the timer ID
+ * from being handed out before this timer is completely gone.
+ */
timer->it_signal_seq++;
- if (WARN_ON_ONCE(!kc || !kc->timer_del))
- return -EINVAL;
- return kc->timer_del(timer);
+ scoped_guard (spinlock, &current->sighand->siglock) {
+ unsigned long sig = (unsigned long)timer->it_signal | 1UL;
+
+ WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig);
+ hlist_del_rcu(&timer->list);
+ posix_timer_cleanup_ignored(timer);
+ }
+
+ while (timer->kclock->timer_del(timer) == TIMER_RETRY) {
+ guard(rcu)();
+ spin_unlock_irq(&timer->it_lock);
+ timer_wait_running(timer);
+ spin_lock_irq(&timer->it_lock);
+ }
}
/* Delete a POSIX.1b interval timer. */
SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
{
struct k_itimer *timer;
- unsigned long flags;
- timer = lock_timer(timer_id, &flags);
-
-retry_delete:
- if (!timer)
- return -EINVAL;
-
- if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) {
- /* Unlocks and relocks the timer if it still exists */
- timer = timer_wait_running(timer, &flags);
- goto retry_delete;
+ scoped_timer_get_or_fail(timer_id) {
+ timer = scoped_timer;
+ posix_timer_delete(timer);
}
-
- spin_lock(&current->sighand->siglock);
- hlist_del(&timer->list);
- posix_timer_cleanup_ignored(timer);
- /*
- * A concurrent lookup could check timer::it_signal lockless. It
- * will reevaluate with timer::it_lock held and observe the NULL.
- *
- * It must be written with siglock held so that the signal code
- * observes timer->it_signal == NULL in do_sigaction(SIG_IGN),
- * which prevents it from moving a pending signal of a deleted
- * timer to the ignore list.
- */
- WRITE_ONCE(timer->it_signal, NULL);
- spin_unlock(&current->sighand->siglock);
-
- unlock_timer(timer, flags);
+ /* Remove it from the hash, which frees up the timer ID */
posix_timer_unhash_and_free(timer);
return 0;
}
/*
- * Delete a timer if it is armed, remove it from the hash and schedule it
- * for RCU freeing.
- */
-static void itimer_delete(struct k_itimer *timer)
-{
- unsigned long flags;
-
- /*
- * irqsave is required to make timer_wait_running() work.
- */
- spin_lock_irqsave(&timer->it_lock, flags);
-
-retry_delete:
- /*
- * Even if the timer is not longer accessible from other tasks
- * it still might be armed and queued in the underlying timer
- * mechanism. Worse, that timer mechanism might run the expiry
- * function concurrently.
- */
- if (timer_delete_hook(timer) == TIMER_RETRY) {
- /*
- * Timer is expired concurrently, prevent livelocks
- * and pointless spinning on RT.
- *
- * timer_wait_running() drops timer::it_lock, which opens
- * the possibility for another task to delete the timer.
- *
- * That's not possible here because this is invoked from
- * do_exit() only for the last thread of the thread group.
- * So no other task can access and delete that timer.
- */
- if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
- return;
-
- goto retry_delete;
- }
- hlist_del(&timer->list);
-
- posix_timer_cleanup_ignored(timer);
-
- /*
- * Setting timer::it_signal to NULL is technically not required
- * here as nothing can access the timer anymore legitimately via
- * the hash table. Set it to NULL nevertheless so that all deletion
- * paths are consistent.
- */
- WRITE_ONCE(timer->it_signal, NULL);
-
- spin_unlock_irqrestore(&timer->it_lock, flags);
- posix_timer_unhash_and_free(timer);
-}
-
-/*
* Invoked from do_exit() when the last thread of a thread group exits.
* At that point no other task can access the timers of the dying
* task anymore.
@@ -1089,18 +1086,26 @@ retry_delete:
void exit_itimers(struct task_struct *tsk)
{
struct hlist_head timers;
+ struct hlist_node *next;
+ struct k_itimer *timer;
+
+ /* Clear restore mode for exec() */
+ tsk->signal->timer_create_restore_ids = 0;
if (hlist_empty(&tsk->signal->posix_timers))
return;
/* Protect against concurrent read via /proc/$PID/timers */
- spin_lock_irq(&tsk->sighand->siglock);
- hlist_move_list(&tsk->signal->posix_timers, &timers);
- spin_unlock_irq(&tsk->sighand->siglock);
+ scoped_guard (spinlock_irq, &tsk->sighand->siglock)
+ hlist_move_list(&tsk->signal->posix_timers, &timers);
 /* The timers are no longer accessible via tsk::signal */
- while (!hlist_empty(&timers))
- itimer_delete(hlist_entry(timers.first, struct k_itimer, list));
+ hlist_for_each_entry_safe(timer, next, &timers, list) {
+ scoped_guard (spinlock_irq, &timer->it_lock)
+ posix_timer_delete(timer);
+ posix_timer_unhash_and_free(timer);
+ cond_resched();
+ }
/*
* There should be no timers on the ignored list. itimer_delete() has
@@ -1545,3 +1550,26 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id)
return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];
}
+
+static int __init posixtimer_init(void)
+{
+ unsigned long i, size;
+ unsigned int shift;
+
+ if (IS_ENABLED(CONFIG_BASE_SMALL))
+ size = 512;
+ else
+ size = roundup_pow_of_two(512 * num_possible_cpus());
+
+ timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets),
+ size, 0, 0, &shift, NULL, size, size);
+ size = 1UL << shift;
+ timer_hashmask = size - 1;
+
+ for (i = 0; i < size; i++) {
+ spin_lock_init(&timer_buckets[i].lock);
+ INIT_HLIST_HEAD(&timer_buckets[i].head);
+ }
+ return 0;
+}
+core_initcall(posixtimer_init);
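
posixtimer_init() sizes the bucket array once at boot: 512 buckets with CONFIG_BASE_SMALL, otherwise 512 per possible CPU rounded up to a power of two, so the bucket index can be taken with a mask instead of a modulo (alloc_large_system_hash() may adjust the size further, which is why the final mask is derived from the returned shift). The arithmetic, as a small stand-alone sketch with example numbers:

    #include <stdio.h>

    /* Round v up to the next power of two (for v >= 1). */
    static unsigned long roundup_pow_of_two(unsigned long v)
    {
        unsigned long p = 1;

        while (p < v)
            p <<= 1;
        return p;
    }

    int main(void)
    {
        unsigned long ncpus = 6;                 /* example CPU count */
        unsigned long size = roundup_pow_of_two(512 * ncpus);
        unsigned long mask = size - 1;
        unsigned long hash = 123456789UL;        /* some bucket hash value */

        /* 512 * 6 = 3072 rounds up to 4096 buckets, mask 0xfff */
        printf("buckets=%lu mask=%#lx index=%lu\n", size, mask, hash & mask);
        return 0;
    }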
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index fcca4e72f1ef..cc15fe293719 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void)
* Start the timer to keep sched_clock() properly updated and
* sets the initial epoch.
*/
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- sched_clock_timer.function = sched_clock_poll;
+ hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index e28f9210f8a1..a88b72b0f35e 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -100,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
void tick_setup_hrtimer_broadcast(void)
{
- hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- bctimer.function = bc_handler;
+ hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
clockevents_register_device(&ce_broadcast_hrtimer);
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa058510af9c..c527b421c865 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1573,12 +1573,10 @@ void tick_setup_sched_timer(bool hrtimer)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
/* Emulate tick processing via per-CPU hrtimers: */
- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) {
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
tick_sched_flag_set(ts, TS_FLAG_HIGHRES);
- ts->sched_timer.function = tick_nohz_handler;
- }
/* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e67d076f195..929846b8b45a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -682,20 +682,19 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
}
/**
- * timekeeping_forward_now - update clock to the current time
+ * timekeeping_forward - update clock to the given cycle_now value
* @tk: Pointer to the timekeeper to update
+ * @cycle_now: Current clocksource read value
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
-static void timekeeping_forward_now(struct timekeeper *tk)
+static void timekeeping_forward(struct timekeeper *tk, u64 cycle_now)
{
- u64 cycle_now, delta;
+ u64 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+ tk->tkr_mono.clock->max_raw_delta);
- cycle_now = tk_clock_read(&tk->tkr_mono);
- delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
- tk->tkr_mono.clock->max_raw_delta);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -711,6 +710,21 @@ static void timekeeping_forward_now(struct timekeeper *tk)
}
/**
+ * timekeeping_forward_now - update clock to the current time
+ * @tk: Pointer to the timekeeper to update
+ *
+ * Forward the current clock to update its state since the last call to
+ * update_wall_time(). This is useful before significant clock changes,
+ * as it avoids having to deal with this time offset explicitly.
+ */
+static void timekeeping_forward_now(struct timekeeper *tk)
+{
+ u64 cycle_now = tk_clock_read(&tk->tkr_mono);
+
+ timekeeping_forward(tk, cycle_now);
+}
+
+/**
* ktime_get_real_ts64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
*
@@ -2151,6 +2165,54 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
return offset;
}
+static u64 timekeeping_accumulate(struct timekeeper *tk, u64 offset,
+ enum timekeeping_adv_mode mode,
+ unsigned int *clock_set)
+{
+ int shift = 0, maxshift;
+
+ /*
+ * TK_ADV_FREQ indicates that adjtimex(2) directly set the
+ * frequency or the tick length.
+ *
+ * Accumulate the offset, so that the new multiplier starts from
+ * now. This is required as otherwise for offsets, which are
+ * smaller than tk::cycle_interval, timekeeping_adjust() could set
+ * xtime_nsec backwards, which subsequently causes time going
+ * backwards in the coarse time getters. But even for the case
+ * where offset is greater than tk::cycle_interval the periodic
+ * accumulation does not have much value.
+ *
+ * Also reset tk::ntp_error as it does not make sense to keep the
+ * old accumulated error around in this case.
+ */
+ if (mode == TK_ADV_FREQ) {
+ timekeeping_forward(tk, tk->tkr_mono.cycle_last + offset);
+ tk->ntp_error = 0;
+ return 0;
+ }
+
+ /*
+ * With NO_HZ we may have to accumulate many cycle_intervals
+ * (think "ticks") worth of time at once. To do this efficiently,
+ * we calculate the largest doubling multiple of cycle_intervals
+ * that is smaller than the offset. We then accumulate that
+ * chunk in one go, and then try to consume the next smaller
+ * doubled multiple.
+ */
+ shift = ilog2(offset) - ilog2(tk->cycle_interval);
+ shift = max(0, shift);
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (64 - (ilog2(ntp_tick_length()) + 1)) - 1;
+ shift = min(shift, maxshift);
+ while (offset >= tk->cycle_interval) {
+ offset = logarithmic_accumulation(tk, offset, shift, clock_set);
+ if (offset < tk->cycle_interval << shift)
+ shift--;
+ }
+ return offset;
+}
+
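
timekeeping_accumulate() keeps the pre-existing NO_HZ logic: consume the offset in the largest power-of-two multiple of cycle_interval that still fits, halving the chunk size whenever it no longer does, so a long idle period costs a logarithmic number of iterations instead of one per missed tick (maxshift additionally caps the chunk so the scaled NTP tick length cannot overflow). The chunking loop, reduced to plain arithmetic with made-up numbers:

    #include <stdio.h>

    /* Integer log2 of a non-zero value. */
    static int ilog2(unsigned long long v)
    {
        int l = -1;

        while (v) {
            v >>= 1;
            l++;
        }
        return l;
    }

    int main(void)
    {
        unsigned long long cycle_interval = 1000;   /* cycles per tick (example) */
        unsigned long long offset = 1234567;        /* cycles since the last update */
        int shift = ilog2(offset) - ilog2(cycle_interval);
        int steps = 0;

        if (shift < 0)
            shift = 0;

        while (offset >= cycle_interval) {
            unsigned long long chunk = cycle_interval << shift;

            /* Accumulate one chunk of 2^shift ticks if it fits ... */
            if (offset >= chunk) {
                offset -= chunk;
                steps++;
            }
            /* ... and drop to the next smaller chunk size otherwise. */
            if (offset < cycle_interval << shift)
                shift--;
        }
        printf("%d chunk steps, %llu cycles left for the next tick\n", steps, offset);
        return 0;
    }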
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
@@ -2160,7 +2222,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
struct timekeeper *tk = &tk_core.shadow_timekeeper;
struct timekeeper *real_tk = &tk_core.timekeeper;
unsigned int clock_set = 0;
- int shift = 0, maxshift;
u64 offset;
guard(raw_spinlock_irqsave)(&tk_core.lock);
@@ -2177,24 +2238,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
return false;
- /*
- * With NO_HZ we may have to accumulate many cycle_intervals
- * (think "ticks") worth of time at once. To do this efficiently,
- * we calculate the largest doubling multiple of cycle_intervals
- * that is smaller than the offset. We then accumulate that
- * chunk in one go, and then try to consume the next smaller
- * doubled multiple.
- */
- shift = ilog2(offset) - ilog2(tk->cycle_interval);
- shift = max(0, shift);
- /* Bound shift to one less than what overflows tick_length */
- maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
- shift = min(shift, maxshift);
- while (offset >= tk->cycle_interval) {
- offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
- if (offset < tk->cycle_interval<<shift)
- shift--;
- }
+ offset = timekeeping_accumulate(tk, offset, mode, &clock_set);
/* Adjust the multiplier to correct NTP error */
timekeeping_adjust(tk, offset);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1c311c46da50..cfbb46cc4e76 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -46,7 +46,7 @@ static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
- SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function);
+ SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function);
SEQ_printf(m, ", S:%02x", timer->state);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
@@ -98,7 +98,7 @@ next_one:
static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
- SEQ_printf(m, " .base: %pK\n", base);
+ SEQ_printf(m, " .base: %p\n", base);
SEQ_printf(m, " .index: %d\n", base->index);
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 05d383143165..01c2ab1e8971 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -15,29 +15,29 @@
#include "timekeeping_internal.h"
-static inline void update_vdso_data(struct vdso_data *vdata,
- struct timekeeper *tk)
+static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk)
{
+ struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
u64 nsec, sec;
- vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
+ vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
+ vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
#endif
- vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
- vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
- vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
- vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
+ vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
+ vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
+ vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
+ vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
+ vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
#endif
- vdata[CS_RAW].mask = tk->tkr_raw.mask;
- vdata[CS_RAW].mult = tk->tkr_raw.mult;
- vdata[CS_RAW].shift = tk->tkr_raw.shift;
+ vc[CS_RAW].mask = tk->tkr_raw.mask;
+ vc[CS_RAW].mult = tk->tkr_raw.mult;
+ vc[CS_RAW].shift = tk->tkr_raw.shift;
/* CLOCK_MONOTONIC */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
nsec = tk->tkr_mono.xtime_nsec;
@@ -55,7 +55,7 @@ static inline void update_vdso_data(struct vdso_data *vdata,
nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift;
/* CLOCK_BOOTTIME */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
vdso_ts->sec = sec;
while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
@@ -65,19 +65,20 @@ static inline void update_vdso_data(struct vdso_data *vdata,
vdso_ts->nsec = nsec;
/* CLOCK_MONOTONIC_RAW */
- vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
+ vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
vdso_ts->sec = tk->raw_sec;
vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
/* CLOCK_TAI */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI];
vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
}
void update_vsyscall(struct timekeeper *tk)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
s32 clock_mode;
u64 nsec;
@@ -86,21 +87,21 @@ void update_vsyscall(struct timekeeper *tk)
vdso_write_begin(vdata);
clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
- vdata[CS_HRES_COARSE].clock_mode = clock_mode;
- vdata[CS_RAW].clock_mode = clock_mode;
+ vc[CS_HRES_COARSE].clock_mode = clock_mode;
+ vc[CS_RAW].clock_mode = clock_mode;
/* CLOCK_REALTIME also required for time() */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
vdso_ts->sec = tk->xtime_sec;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
/* CLOCK_REALTIME_COARSE */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
/* CLOCK_MONOTONIC_COARSE */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
@@ -108,32 +109,31 @@ void update_vsyscall(struct timekeeper *tk)
/*
* Read without the seqlock held by clock_getres().
- * Note: No need to have a second copy.
*/
- WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
+ WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution);
/*
* If the current clocksource is not VDSO capable, then spare the
* update of the high resolution parts.
*/
if (clock_mode != VDSO_CLOCKMODE_NONE)
- update_vdso_data(vdata, tk);
+ update_vdso_time_data(vdata, tk);
__arch_update_vsyscall(vdata);
vdso_write_end(vdata);
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
}
void update_vsyscall_tz(void)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
- vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
- vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
+ vdata->tz_minuteswest = sys_tz.tz_minuteswest;
+ vdata->tz_dsttime = sys_tz.tz_dsttime;
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
}
/**
@@ -150,7 +150,7 @@ void update_vsyscall_tz(void)
*/
unsigned long vdso_update_begin(void)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
unsigned long flags = timekeeper_lock_irqsave();
vdso_write_begin(vdata);
@@ -167,9 +167,9 @@ unsigned long vdso_update_begin(void)
*/
void vdso_update_end(unsigned long flags)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
vdso_write_end(vdata);
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
timekeeper_unlock_irqrestore(flags);
}
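
The conversions throughout this file reflect the new vDSO data layout: instead of an array of two vdso_data entries indexed by CS_HRES_COARSE/CS_RAW, there is one struct vdso_time_data holding a clock_data[] array plus the fields that only need to exist once, such as hrtimer_res and the timezone values. Roughly, and only as an approximation of the real include/vdso/datapage.h definitions:

    /* Approximate shape only; field types and counts are assumptions. */

    #define CS_HRES_COARSE  0
    #define CS_RAW          1
    #define CS_BASES        2
    #define VDSO_BASES      12          /* one slot per clock id, assumed */

    struct vdso_timestamp {
        unsigned long long sec;
        unsigned long long nsec;
    };

    struct vdso_clock {                 /* was: one vdso_data[] element */
        unsigned long long cycle_last;
        unsigned long long mask;
        unsigned int mult, shift;
        int clock_mode;
        struct vdso_timestamp basetime[VDSO_BASES];
    };

    struct vdso_time_data {             /* was: vdso_data[CS_BASES] */
        struct vdso_clock clock_data[CS_BASES];
        unsigned int hrtimer_res;       /* a single copy is enough */
        int tz_minuteswest;
        int tz_dsttime;
    };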
diff --git a/kernel/torture.c b/kernel/torture.c
index dede150aef01..3a0a8cc60401 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -792,6 +792,8 @@ static void torture_stutter_cleanup(void)
stutter_task = NULL;
}
+static unsigned long torture_init_jiffies;
+
static void
torture_print_module_parms(void)
{
@@ -821,6 +823,7 @@ bool torture_init_begin(char *ttype, int v)
torture_type = ttype;
verbose = v;
fullstop = FULLSTOP_DONTSTOP;
+ WRITE_ONCE(torture_init_jiffies, jiffies); // Lockless reads.
torture_print_module_parms();
return true;
}
@@ -837,6 +840,15 @@ void torture_init_end(void)
EXPORT_SYMBOL_GPL(torture_init_end);
/*
+ * Get the torture_init_begin()-time value of the jiffies counter.
+ */
+unsigned long get_torture_init_jiffies(void)
+{
+ return READ_ONCE(torture_init_jiffies);
+}
+EXPORT_SYMBOL_GPL(get_torture_init_jiffies);
+
+/*
* Clean up torture module. Please note that this is -not- invoked via
* the usual module_exit() mechanism, but rather by an explicit call from
* the client torture module. Returns true if a race with system shutdown
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d570b8b9c0a9..033fba0633cf 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -263,6 +263,18 @@ config FUNCTION_GRAPH_RETADDR
the function is called. This feature is off by default, and you can
enable it via the trace option funcgraph-retaddr.
+config FUNCTION_TRACE_ARGS
+ bool
+ depends on HAVE_FUNCTION_ARG_ACCESS_API
+ depends on DEBUG_INFO_BTF
+ default y
+ help
+ If supported with the function argument access API and BTF, then
+ the function tracer and function graph tracer will support printing
+ of function arguments. This feature is off by default, and can be
+ enabled via the trace option func-args (for the function tracer) and
+ funcgraph-args (for the function graph tracer).
+
config DYNAMIC_FTRACE
bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index adc947587eb8..187dc37d61d4 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -392,7 +392,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
.arg2_type = ARG_CONST_SIZE,
};
-static void __set_printk_clr_event(void)
+static void __set_printk_clr_event(struct work_struct *work)
{
/*
* This program might be calling bpf_trace_printk,
@@ -405,10 +405,11 @@ static void __set_printk_clr_event(void)
if (trace_set_clr_event("bpf_trace", "bpf_trace_printk", 1))
pr_warn_ratelimited("could not enable bpf_trace_printk events");
}
+static DECLARE_WORK(set_printk_work, __set_printk_clr_event);
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_printk_proto;
}
@@ -451,7 +452,7 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = {
const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)
{
- __set_printk_clr_event();
+ schedule_work(&set_printk_work);
return &bpf_trace_vprintk_proto;
}
@@ -606,6 +607,11 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+const struct bpf_func_proto *bpf_get_perf_event_read_value_proto(void)
+{
+ return &bpf_perf_event_read_value_proto;
+}
+
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_raw_record *raw,
@@ -843,7 +849,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc
if (unlikely(is_global_init(task)))
return -EPERM;
- if (!preemptible()) {
+ if (preempt_count() != 0 || irqs_disabled()) {
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work.
*/
@@ -1038,27 +1044,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
.arg1_type = ARG_PTR_TO_CTX,
};
-#ifdef CONFIG_X86_KERNEL_IBT
-static unsigned long get_entry_ip(unsigned long fentry_ip)
+static inline unsigned long get_entry_ip(unsigned long fentry_ip)
{
- u32 instr;
-
- /* We want to be extra safe in case entry ip is on the page edge,
- * but otherwise we need to avoid get_kernel_nofault()'s overhead.
- */
- if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) {
- if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE)))
- return fentry_ip;
- } else {
- instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE);
- }
- if (is_endbr(instr))
+#ifdef CONFIG_X86_KERNEL_IBT
+ if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE)))
fentry_ip -= ENDBR_INSN_SIZE;
+#endif
return fentry_ip;
}
-#else
-#define get_entry_ip(fentry_ip) fentry_ip
-#endif
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
@@ -2345,10 +2338,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
{
struct module *mod;
- preempt_disable();
+ guard(rcu)();
mod = __module_address((unsigned long)btp);
module_put(mod);
- preempt_enable();
}
static __always_inline
@@ -2932,18 +2924,21 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3
u32 i, err = 0;
for (i = 0; i < addrs_cnt; i++) {
+ bool skip_add = false;
struct module *mod;
- preempt_disable();
- mod = __module_address(addrs[i]);
- /* Either no module or we it's already stored */
- if (!mod || has_module(&arr, mod)) {
- preempt_enable();
- continue;
+ scoped_guard(rcu) {
+ mod = __module_address(addrs[i]);
+ /* Either no module or it's already stored */
+ if (!mod || has_module(&arr, mod)) {
+ skip_add = true;
+ break; /* scoped_guard */
+ }
+ if (!try_module_get(mod))
+ err = -EINVAL;
}
- if (!try_module_get(mod))
- err = -EINVAL;
- preempt_enable();
+ if (skip_add)
+ continue;
if (err)
break;
err = add_module(&arr, mod);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 5dddfc2149f6..8d925cbdce3a 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -865,7 +865,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe
}
/*
- * After all architecures have selected HAVE_FUNCTION_GRAPH_FREGS, we can
+ * After all architectures have selected HAVE_FUNCTION_GRAPH_FREGS, we can
* leave only ftrace_return_to_handler(fregs).
*/
#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fc88e0688daf..92015de6203d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1293,6 +1293,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
void ftrace_free_filter(struct ftrace_ops *ops)
{
ftrace_ops_init(ops);
+ if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return;
free_ftrace_hash(ops->func_hash->filter_hash);
free_ftrace_hash(ops->func_hash->notrace_hash);
}
@@ -7016,6 +7018,7 @@ static int ftrace_process_locs(struct module *mod,
unsigned long *p;
unsigned long addr;
unsigned long flags = 0; /* Shut up gcc */
+ unsigned long pages;
int ret = -ENOMEM;
count = end - start;
@@ -7023,6 +7026,8 @@ static int ftrace_process_locs(struct module *mod,
if (!count)
return 0;
+ pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE);
+
/*
* Sorting mcount in vmlinux at build time depend on
* CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in
@@ -7067,7 +7072,9 @@ static int ftrace_process_locs(struct module *mod,
pg = start_pg;
while (p < end) {
unsigned long end_offset;
- addr = ftrace_call_adjust(*p++);
+
+ addr = *p++;
+
/*
* Some architecture linkers will pad between
* the different mcount_loc sections of different
@@ -7079,6 +7086,19 @@ static int ftrace_process_locs(struct module *mod,
continue;
}
+ /*
+ * If this is core kernel, make sure the address is in core
+ * or inittext, as weak functions get zeroed and KASLR can
+ * move them to something other than zero. It just will not
+ * move it to an area where kernel text is.
+ */
+ if (!mod && !(is_kernel_text(addr) || is_kernel_inittext(addr))) {
+ skipped++;
+ continue;
+ }
+
+ addr = ftrace_call_adjust(addr);
+
end_offset = (pg->index+1) * sizeof(pg->records[0]);
if (end_offset > PAGE_SIZE << pg->order) {
/* We should have allocated enough */
@@ -7118,11 +7138,41 @@ static int ftrace_process_locs(struct module *mod,
/* We should have used all pages unless we skipped some */
if (pg_unuse) {
- WARN_ON(!skipped);
+ unsigned long pg_remaining, remaining = 0;
+ unsigned long skip;
+
+ /* Count the number of entries unused and compare it to skipped. */
+ pg_remaining = (ENTRIES_PER_PAGE << pg->order) - pg->index;
+
+ if (!WARN(skipped < pg_remaining, "Extra allocated pages for ftrace")) {
+
+ skip = skipped - pg_remaining;
+
+ for (pg = pg_unuse; pg; pg = pg->next)
+ remaining += 1 << pg->order;
+
+ pages -= remaining;
+
+ skip = DIV_ROUND_UP(skip, ENTRIES_PER_PAGE);
+
+ /*
+ * Check to see if the number of pages remaining would
+ * just fit the number of entries skipped.
+ */
+ WARN(skip != remaining, "Extra allocated pages for ftrace: %lu with %lu skipped",
+ remaining, skipped);
+ }
/* Need to synchronize with ftrace_location_range() */
synchronize_rcu();
ftrace_free_pages(pg_unuse);
}
+
+ if (!mod) {
+ count -= skipped;
+ pr_info("ftrace: allocating %ld entries in %ld pages\n",
+ count, pages);
+ }
+
return ret;
}
@@ -7768,9 +7818,6 @@ void __init ftrace_init(void)
goto failed;
}
- pr_info("ftrace: allocating %ld entries in %ld pages\n",
- count, DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
-
ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bb6089c2951e..d8d7b28e2c2f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -31,6 +31,7 @@
#include <asm/local64.h>
#include <asm/local.h>
+#include <asm/setup.h>
#include "trace.h"
@@ -48,9 +49,12 @@ static void update_pages_handler(struct work_struct *work);
struct ring_buffer_meta {
int magic;
- int struct_size;
- unsigned long text_addr;
- unsigned long data_addr;
+ int struct_sizes;
+ unsigned long total_size;
+ unsigned long buffers_offset;
+};
+
+struct ring_buffer_cpu_meta {
unsigned long first_buffer;
unsigned long head_buffer;
unsigned long commit_buffer;
@@ -517,7 +521,7 @@ struct ring_buffer_per_cpu {
struct mutex mapping_lock;
unsigned long *subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
- struct ring_buffer_meta *ring_meta;
+ struct ring_buffer_cpu_meta *ring_meta;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -550,8 +554,7 @@ struct trace_buffer {
unsigned long range_addr_start;
unsigned long range_addr_end;
- long last_text_delta;
- long last_data_delta;
+ struct ring_buffer_meta *meta;
unsigned int subbuf_size;
unsigned int subbuf_order;
@@ -1271,7 +1274,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
rb_set_list_to_head(head->list.prev);
if (cpu_buffer->ring_meta) {
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
meta->head_buffer = (unsigned long)head->page;
}
}
@@ -1569,7 +1572,7 @@ out_locked:
static unsigned long
rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
{
- addr += sizeof(struct ring_buffer_meta) +
+ addr += sizeof(struct ring_buffer_cpu_meta) +
sizeof(int) * nr_subbufs;
return ALIGN(addr, subbuf_size);
}
@@ -1580,19 +1583,22 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
{
int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
- unsigned long ptr = buffer->range_addr_start;
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
+ struct ring_buffer_meta *bmeta;
+ unsigned long ptr;
int nr_subbufs;
- if (!ptr)
+ bmeta = buffer->meta;
+ if (!bmeta)
return NULL;
+ ptr = (unsigned long)bmeta + bmeta->buffers_offset;
+ meta = (struct ring_buffer_cpu_meta *)ptr;
+
/* When nr_pages passed in is zero, the first meta has already been initialized */
if (!nr_pages) {
- meta = (struct ring_buffer_meta *)ptr;
nr_subbufs = meta->nr_subbufs;
} else {
- meta = NULL;
/* Include the reader page */
nr_subbufs = nr_pages + 1;
}
@@ -1624,7 +1630,7 @@ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
}
/* Return the start of subbufs given the meta pointer */
-static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta)
{
int subbuf_size = meta->subbuf_size;
unsigned long ptr;
@@ -1640,7 +1646,7 @@ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
*/
static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
{
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
unsigned long ptr;
int subbuf_size;
@@ -1666,14 +1672,77 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
}
/*
+ * See if the existing memory contains a valid meta section.
+ * If so, use that; otherwise initialize it.
+ */
+static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size)
+{
+ unsigned long ptr = buffer->range_addr_start;
+ struct ring_buffer_meta *bmeta;
+ unsigned long total_size;
+ int struct_sizes;
+
+ bmeta = (struct ring_buffer_meta *)ptr;
+ buffer->meta = bmeta;
+
+ total_size = buffer->range_addr_end - buffer->range_addr_start;
+
+ struct_sizes = sizeof(struct ring_buffer_cpu_meta);
+ struct_sizes |= sizeof(*bmeta) << 16;
+
+ /* The first buffer will start word size after the meta page */
+ ptr += sizeof(*bmeta);
+ ptr = ALIGN(ptr, sizeof(long));
+ ptr += scratch_size;
+
+ if (bmeta->magic != RING_BUFFER_META_MAGIC) {
+ pr_info("Ring buffer boot meta mismatch of magic\n");
+ goto init;
+ }
+
+ if (bmeta->struct_sizes != struct_sizes) {
+ pr_info("Ring buffer boot meta mismatch of struct size\n");
+ goto init;
+ }
+
+ if (bmeta->total_size != total_size) {
+ pr_info("Ring buffer boot meta mismatch of total size\n");
+ goto init;
+ }
+
+ if (bmeta->buffers_offset > bmeta->total_size) {
+ pr_info("Ring buffer boot meta mismatch of offset outside of total size\n");
+ goto init;
+ }
+
+ if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) {
+ pr_info("Ring buffer boot meta mismatch of first buffer offset\n");
+ goto init;
+ }
+
+ return true;
+
+ init:
+ bmeta->magic = RING_BUFFER_META_MAGIC;
+ bmeta->struct_sizes = struct_sizes;
+ bmeta->total_size = total_size;
+ bmeta->buffers_offset = (void *)ptr - (void *)bmeta;
+
+ /* Zero out the scratch pad */
+ memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta));
+
+ return false;
+}
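
rb_meta_init() guards the persistent buffer with a packed size check: the per-CPU meta size goes in the low 16 bits of struct_sizes and the global meta size in the high 16 bits, so a layout change to either structure between boots makes the comparison fail and forces re-initialization. A quick stand-alone illustration of the packing (demo structures, not the real ones):

    #include <stdio.h>

    /* Stand-ins for the two kernel structures. */
    struct cpu_meta_demo { unsigned long first, head, commit; };
    struct meta_demo { int magic, struct_sizes; unsigned long total_size, buffers_offset; };

    int main(void)
    {
        int struct_sizes = sizeof(struct cpu_meta_demo);

        struct_sizes |= sizeof(struct meta_demo) << 16;

        printf("packed value: %#x\n", struct_sizes);
        printf("cpu meta:     %d bytes\n", struct_sizes & 0xffff);
        printf("global meta:  %d bytes\n", struct_sizes >> 16);

        /* A kernel built with different structures computes a different
         * value, so the bmeta->struct_sizes comparison rejects the data. */
        return 0;
    }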
+
+/*
* See if the existing memory contains valid ring buffer data.
* As the previous kernel must be the same as this kernel, all
* the calculations (size of buffers and number of buffers)
* must be the same.
*/
-static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
- struct trace_buffer *buffer, int nr_pages,
- unsigned long *subbuf_mask)
+static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages,
+ unsigned long *subbuf_mask)
{
int subbuf_size = PAGE_SIZE;
struct buffer_data_page *subbuf;
@@ -1684,20 +1753,6 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
if (!subbuf_mask)
return false;
- /* Check the meta magic and meta struct size */
- if (meta->magic != RING_BUFFER_META_MAGIC ||
- meta->struct_size != sizeof(*meta)) {
- pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu);
- return false;
- }
-
- /* The subbuffer's size and number of subbuffers must match */
- if (meta->subbuf_size != subbuf_size ||
- meta->nr_subbufs != nr_pages + 1) {
- pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu);
- return false;
- }
-
buffers_start = meta->first_buffer;
buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
@@ -1743,7 +1798,7 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
return true;
}
-static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf);
static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
unsigned long long *timestamp, u64 *delta_ptr)
@@ -1810,7 +1865,7 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
/* If the meta data has been validated, now validate the events */
static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
struct buffer_page *head_page;
unsigned long entry_bytes = 0;
unsigned long entries = 0;
@@ -1891,24 +1946,13 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
}
}
-/* Used to calculate data delta */
-static char rb_data_ptr[] = "";
-
-#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr)
-#define THIS_DATA_PTR ((unsigned long)rb_data_ptr)
-
-static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
-{
- meta->text_addr = THIS_TEXT_PTR;
- meta->data_addr = THIS_DATA_PTR;
-}
-
-static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
+static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size)
{
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
unsigned long *subbuf_mask;
unsigned long delta;
void *subbuf;
+ bool valid = false;
int cpu;
int i;
@@ -1916,20 +1960,21 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL);
/* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */
+ if (rb_meta_init(buffer, scratch_size))
+ valid = true;
+
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
void *next_meta;
meta = rb_range_meta(buffer, nr_pages, cpu);
- if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
+ if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
/* Make the mappings match the current address */
subbuf = rb_subbufs_from_meta(meta);
delta = (unsigned long)subbuf - meta->first_buffer;
meta->first_buffer += delta;
meta->head_buffer += delta;
meta->commit_buffer += delta;
- buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr;
- buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr;
continue;
}
@@ -1940,16 +1985,12 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
memset(meta, 0, next_meta - (void *)meta);
- meta->magic = RING_BUFFER_META_MAGIC;
- meta->struct_size = sizeof(*meta);
-
meta->nr_subbufs = nr_pages + 1;
meta->subbuf_size = PAGE_SIZE;
subbuf = rb_subbufs_from_meta(meta);
meta->first_buffer = (unsigned long)subbuf;
- rb_meta_init_text_addr(meta);
/*
* The buffers[] array holds the order of the sub-buffers
@@ -1971,7 +2012,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
static void *rbm_start(struct seq_file *m, loff_t *pos)
{
struct ring_buffer_per_cpu *cpu_buffer = m->private;
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long val;
if (!meta)
@@ -1996,7 +2037,7 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
static int rbm_show(struct seq_file *m, void *v)
{
struct ring_buffer_per_cpu *cpu_buffer = m->private;
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long val = (unsigned long)v;
if (val == 1) {
@@ -2045,7 +2086,7 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in
static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
if (meta->head_buffer == (unsigned long)bpage->page)
cpu_buffer->head_page = bpage;
@@ -2060,7 +2101,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
struct trace_buffer *buffer = cpu_buffer->buffer;
- struct ring_buffer_meta *meta = NULL;
+ struct ring_buffer_cpu_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
gfp_t mflags;
@@ -2184,7 +2225,7 @@ static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
struct buffer_page *bpage;
struct page *page;
int ret;
@@ -2313,6 +2354,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
+ unsigned long scratch_size,
struct lock_class_key *key)
{
struct trace_buffer *buffer;
@@ -2355,10 +2397,23 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
/* If start/end are specified, then that overrides size */
if (start && end) {
+ unsigned long buffers_start;
unsigned long ptr;
int n;
- size = end - start;
+ /* Make sure that start is word aligned */
+ start = ALIGN(start, sizeof(long));
+
+ /* scratch_size needs to be aligned too */
+ scratch_size = ALIGN(scratch_size, sizeof(long));
+
+ /* Subtract the buffer meta data and word aligned */
+ buffers_start = start + sizeof(struct ring_buffer_cpu_meta);
+ buffers_start = ALIGN(buffers_start, sizeof(long));
+ buffers_start += scratch_size;
+
+ /* Calculate the size for the per CPU data */
+ size = end - buffers_start;
size = size / nr_cpu_ids;
/*
@@ -2368,7 +2423,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
* needed, plus account for the integer array index that
* will be appended to the meta data.
*/
- nr_pages = (size - sizeof(struct ring_buffer_meta)) /
+ nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) /
(subbuf_size + sizeof(int));
/* Need at least two pages plus the reader page */
if (nr_pages < 3)
@@ -2376,8 +2431,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
again:
/* Make sure that the size fits aligned */
- for (n = 0, ptr = start; n < nr_cpu_ids; n++) {
- ptr += sizeof(struct ring_buffer_meta) +
+ for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) {
+ ptr += sizeof(struct ring_buffer_cpu_meta) +
sizeof(int) * nr_pages;
ptr = ALIGN(ptr, subbuf_size);
ptr += subbuf_size * nr_pages;
@@ -2394,7 +2449,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
buffer->range_addr_start = start;
buffer->range_addr_end = end;
- rb_range_meta_init(buffer, nr_pages);
+ rb_range_meta_init(buffer, nr_pages, scratch_size);
} else {
/* need at least two pages */
@@ -2447,7 +2502,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
/* Default buffer page size - one system page */
- return alloc_buffer(size, flags, 0, 0, 0,key);
+ return alloc_buffer(size, flags, 0, 0, 0, 0, key);
}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
@@ -2459,6 +2514,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
* @order: sub-buffer order
* @start: start of allocated range
* @range_size: size of allocated range
+ * @scratch_size: size of scratch area (for preallocated memory buffers)
* @key: ring buffer reader_lock_key.
*
* Currently the only flag that is available is the RB_FL_OVERWRITE
@@ -2469,32 +2525,29 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long range_size,
+ unsigned long scratch_size,
struct lock_class_key *key)
{
- return alloc_buffer(size, flags, order, start, start + range_size, key);
+ return alloc_buffer(size, flags, order, start, start + range_size,
+ scratch_size, key);
}
-/**
- * ring_buffer_last_boot_delta - return the delta offset from last boot
- * @buffer: The buffer to return the delta from
- * @text: Return text delta
- * @data: Return data delta
- *
- * Returns: The true if the delta is non zero
- */
-bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
- long *data)
+void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size)
{
- if (!buffer)
- return false;
+ struct ring_buffer_meta *meta;
+ void *ptr;
- if (!buffer->last_text_delta)
- return false;
+ if (!buffer || !buffer->meta)
+ return NULL;
- *text = buffer->last_text_delta;
- *data = buffer->last_data_delta;
+ meta = buffer->meta;
- return true;
+ ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long));
+
+ if (size)
+ *size = (void *)meta + meta->buffers_offset - ptr;
+
+ return ptr;
}
/**
@@ -3105,7 +3158,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
}
/* Return the index into the sub-buffers for a given sub-buffer */
-static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
+static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf)
{
void *subbuf_array;
@@ -3117,7 +3170,7 @@ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *next_page)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long old_head = (unsigned long)next_page->page;
unsigned long new_head;
@@ -3134,7 +3187,7 @@ static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *reader)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
void *old_reader = cpu_buffer->reader_page->page;
void *new_reader = reader->page;
int id;
@@ -3763,7 +3816,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
if (cpu_buffer->ring_meta) {
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page;
}
/* add barrier to keep gcc from optimizing too much */
@@ -5318,7 +5371,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* moving it. The page before the header page has the
* flag bit '1' set if it is pointing to the page we want.
* but if the writer is in the process of moving it
- * than it will be '2' or already moved '0'.
+ * then it will be '2' or already moved '0'.
*/
ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
@@ -6016,7 +6069,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
if (cpu_buffer->mapped) {
rb_update_meta_page(cpu_buffer);
if (cpu_buffer->ring_meta) {
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
meta->commit_buffer = meta->head_buffer;
}
}
@@ -6050,7 +6103,6 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- struct ring_buffer_meta *meta;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
@@ -6069,11 +6121,6 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
- /* Make sure persistent meta now uses this buffer's addresses */
- meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
- if (meta)
- rb_meta_init_text_addr(meta);
-
mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -6088,7 +6135,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct ring_buffer_meta *meta;
int cpu;
/* prevent another thread from changing buffer sizes */
@@ -6116,11 +6162,6 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
reset_disabled_cpu_buffer(cpu_buffer);
- /* Make sure persistent meta now uses this buffer's addresses */
- meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
- if (meta)
- rb_meta_init_text_addr(meta);
-
atomic_dec(&cpu_buffer->record_disabled);
atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
@@ -7411,9 +7452,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
/* Ignore dropped events before test starts. */
if (started) {
if (nested)
- data->bytes_dropped += len;
- else
data->bytes_dropped_nested += len;
+ else
+ data->bytes_dropped += len;
}
return len;
}
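A minimal sketch (not part of the patch) of how a caller could use the reworked range-allocation API together with the new ring_buffer_meta_scratch() helper. The struct my_state type, the example() wrapper and the sizes below are placeholders for illustration only; the in-tree consumer is the persistent trace buffer's struct trace_scratch added to kernel/trace/trace.c later in this diff.

	/* Hedged sketch: names and sizes are placeholders, not kernel API. */
	struct my_state {
		unsigned long last_text_addr;
	};

	static struct lock_class_key key;

	static struct trace_buffer *example(unsigned long start, unsigned long range_size)
	{
		struct trace_buffer *buf;
		unsigned int scratch_len;
		struct my_state *state;

		/* Reserve sizeof(*state) bytes between the meta header and the CPU buffers */
		buf = __ring_buffer_alloc_range(8 * PAGE_SIZE, RB_FL_OVERWRITE, 0,
						start, range_size,
						sizeof(*state), &key);
		if (!buf)
			return NULL;

		/* scratch_len may exceed the requested size due to word alignment */
		state = ring_buffer_meta_scratch(buf, &scratch_len);
		if (state && scratch_len >= sizeof(*state))
			state->last_text_addr = (unsigned long)_text;

		return buf;
	}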
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 8226352a0062..b39f36013ef2 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -27,6 +27,13 @@ menuconfig RV
source "kernel/trace/rv/monitors/wip/Kconfig"
source "kernel/trace/rv/monitors/wwnr/Kconfig"
+source "kernel/trace/rv/monitors/sched/Kconfig"
+source "kernel/trace/rv/monitors/tss/Kconfig"
+source "kernel/trace/rv/monitors/sco/Kconfig"
+source "kernel/trace/rv/monitors/snroc/Kconfig"
+source "kernel/trace/rv/monitors/scpd/Kconfig"
+source "kernel/trace/rv/monitors/snep/Kconfig"
+source "kernel/trace/rv/monitors/sncid/Kconfig"
# Add new monitors here
config RV_REACTORS
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 188b64668e1f..f9b2cd0483c3 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -5,6 +5,13 @@ ccflags-y += -I $(src) # needed for trace events
obj-$(CONFIG_RV) += rv.o
obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
+obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o
+obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o
+obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o
+obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o
+obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o
+obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o
+obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o
# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig
new file mode 100644
index 000000000000..ae3eb410abd7
--- /dev/null
+++ b/kernel/trace/rv/monitors/sched/Kconfig
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SCHED
+ depends on RV
+ bool "sched monitor"
+ help
+ Collection of monitors to check that the scheduler behaves according to specifications.
+ Enable this to include all scheduler specifications supported by the current kernel.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c
new file mode 100644
index 000000000000..905e03c3c934
--- /dev/null
+++ b/kernel/trace/rv/monitors/sched/sched.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+#define MODULE_NAME "sched"
+
+#include "sched.h"
+
+struct rv_monitor rv_sched;
+
+struct rv_monitor rv_sched = {
+ .name = "sched",
+ .description = "container for several scheduler monitor specifications.",
+ .enable = NULL,
+ .disable = NULL,
+ .reset = NULL,
+ .enabled = 0,
+};
+
+static int __init register_sched(void)
+{
+ rv_register_monitor(&rv_sched, NULL);
+ return 0;
+}
+
+static void __exit unregister_sched(void)
+{
+ rv_unregister_monitor(&rv_sched);
+}
+
+module_init(register_sched);
+module_exit(unregister_sched);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sched: container for several scheduler monitor specifications.");
diff --git a/kernel/trace/rv/monitors/sched/sched.h b/kernel/trace/rv/monitors/sched/sched.h
new file mode 100644
index 000000000000..ba148dd8d48b
--- /dev/null
+++ b/kernel/trace/rv/monitors/sched/sched.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+extern struct rv_monitor rv_sched;
diff --git a/kernel/trace/rv/monitors/sco/Kconfig b/kernel/trace/rv/monitors/sco/Kconfig
new file mode 100644
index 000000000000..097c96cccdd7
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SCO
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "sco monitor"
+ help
+ Monitor to ensure sched_set_state happens only in thread context.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c
new file mode 100644
index 000000000000..4cff59220bfc
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/sco.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "sco"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "sco.h"
+
+static struct rv_monitor rv_sco;
+DECLARE_DA_MON_PER_CPU(sco, unsigned char);
+
+static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+{
+ da_handle_start_event_sco(sched_set_state_sco);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_event_sco(schedule_entry_sco);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_start_event_sco(schedule_exit_sco);
+}
+
+static int enable_sco(void)
+{
+ int retval;
+
+ retval = da_monitor_init_sco();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("sco", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("sco", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_sco(void)
+{
+ rv_sco.enabled = 0;
+
+ rv_detach_trace_probe("sco", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("sco", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("sco", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_sco();
+}
+
+static struct rv_monitor rv_sco = {
+ .name = "sco",
+ .description = "scheduling context operations.",
+ .enable = enable_sco,
+ .disable = disable_sco,
+ .reset = da_monitor_reset_all_sco,
+ .enabled = 0,
+};
+
+static int __init register_sco(void)
+{
+ rv_register_monitor(&rv_sco, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_sco(void)
+{
+ rv_unregister_monitor(&rv_sco);
+}
+
+module_init(register_sco);
+module_exit(unregister_sco);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sco: scheduling context operations.");
diff --git a/kernel/trace/rv/monitors/sco/sco.h b/kernel/trace/rv/monitors/sco/sco.h
new file mode 100644
index 000000000000..7a4c1f2d5ca1
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/sco.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sco automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_sco {
+ thread_context_sco = 0,
+ scheduling_context_sco,
+ state_max_sco
+};
+
+#define INVALID_STATE state_max_sco
+
+enum events_sco {
+ sched_set_state_sco = 0,
+ schedule_entry_sco,
+ schedule_exit_sco,
+ event_max_sco
+};
+
+struct automaton_sco {
+ char *state_names[state_max_sco];
+ char *event_names[event_max_sco];
+ unsigned char function[state_max_sco][event_max_sco];
+ unsigned char initial_state;
+ bool final_states[state_max_sco];
+};
+
+static const struct automaton_sco automaton_sco = {
+ .state_names = {
+ "thread_context",
+ "scheduling_context"
+ },
+ .event_names = {
+ "sched_set_state",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { thread_context_sco, scheduling_context_sco, INVALID_STATE },
+ { INVALID_STATE, INVALID_STATE, thread_context_sco },
+ },
+ .initial_state = thread_context_sco,
+ .final_states = { 1, 0 },
+};
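For readers unfamiliar with the deterministic-automata format, a simplified illustration of how a monitor consumes this table: each event is a lookup in .function[current_state][event], and INVALID_STATE means the specification was violated. This is assumed from the table layout above, not the actual rv/da_monitor.h helpers, and the function below is made up for illustration.

	/* Illustrative only; the real stepping code is generated by da_monitor.h. */
	static inline int sco_try_step(unsigned char curr, enum events_sco event)
	{
		unsigned char next = automaton_sco.function[curr][event];

		if (next == INVALID_STATE)
			return -1;	/* e.g. schedule_exit while in thread_context */

		/* thread_context + schedule_entry -> scheduling_context, and so on */
		return next;
	}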
diff --git a/kernel/trace/rv/monitors/sco/sco_trace.h b/kernel/trace/rv/monitors/sco/sco_trace.h
new file mode 100644
index 000000000000..b711cd9024ec
--- /dev/null
+++ b/kernel/trace/rv/monitors/sco/sco_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SCO
+DEFINE_EVENT(event_da_monitor, event_sco,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_sco,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SCO */
diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig
new file mode 100644
index 000000000000..b9114fbf680f
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SCPD
+ depends on RV
+ depends on PREEMPT_TRACER
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "scpd monitor"
+ help
+ Monitor to ensure schedule is called with preemption disabled.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c
new file mode 100644
index 000000000000..cbdd6a5f8d7f
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/scpd.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "scpd"
+
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "scpd.h"
+
+static struct rv_monitor rv_scpd;
+DECLARE_DA_MON_PER_CPU(scpd, unsigned char);
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_scpd(preempt_disable_scpd);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_scpd(preempt_enable_scpd);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_event_scpd(schedule_entry_scpd);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_event_scpd(schedule_exit_scpd);
+}
+
+static int enable_scpd(void)
+{
+ int retval;
+
+ retval = da_monitor_init_scpd();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("scpd", preempt_disable, handle_preempt_disable);
+ rv_attach_trace_probe("scpd", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_scpd(void)
+{
+ rv_scpd.enabled = 0;
+
+ rv_detach_trace_probe("scpd", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("scpd", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("scpd", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("scpd", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_scpd();
+}
+
+static struct rv_monitor rv_scpd = {
+ .name = "scpd",
+ .description = "schedule called with preemption disabled.",
+ .enable = enable_scpd,
+ .disable = disable_scpd,
+ .reset = da_monitor_reset_all_scpd,
+ .enabled = 0,
+};
+
+static int __init register_scpd(void)
+{
+ rv_register_monitor(&rv_scpd, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_scpd(void)
+{
+ rv_unregister_monitor(&rv_scpd);
+}
+
+module_init(register_scpd);
+module_exit(unregister_scpd);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("scpd: schedule called with preemption disabled.");
diff --git a/kernel/trace/rv/monitors/scpd/scpd.h b/kernel/trace/rv/monitors/scpd/scpd.h
new file mode 100644
index 000000000000..295f735a5811
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/scpd.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of scpd automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_scpd {
+ cant_sched_scpd = 0,
+ can_sched_scpd,
+ state_max_scpd
+};
+
+#define INVALID_STATE state_max_scpd
+
+enum events_scpd {
+ preempt_disable_scpd = 0,
+ preempt_enable_scpd,
+ schedule_entry_scpd,
+ schedule_exit_scpd,
+ event_max_scpd
+};
+
+struct automaton_scpd {
+ char *state_names[state_max_scpd];
+ char *event_names[event_max_scpd];
+ unsigned char function[state_max_scpd][event_max_scpd];
+ unsigned char initial_state;
+ bool final_states[state_max_scpd];
+};
+
+static const struct automaton_scpd automaton_scpd = {
+ .state_names = {
+ "cant_sched",
+ "can_sched"
+ },
+ .event_names = {
+ "preempt_disable",
+ "preempt_enable",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { can_sched_scpd, INVALID_STATE, INVALID_STATE, INVALID_STATE },
+ { INVALID_STATE, cant_sched_scpd, can_sched_scpd, can_sched_scpd },
+ },
+ .initial_state = cant_sched_scpd,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/scpd/scpd_trace.h b/kernel/trace/rv/monitors/scpd/scpd_trace.h
new file mode 100644
index 000000000000..6b0f4aa4732e
--- /dev/null
+++ b/kernel/trace/rv/monitors/scpd/scpd_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SCPD
+DEFINE_EVENT(event_da_monitor, event_scpd,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_scpd,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SCPD */
diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sncid/Kconfig
new file mode 100644
index 000000000000..76bcfef4fd10
--- /dev/null
+++ b/kernel/trace/rv/monitors/sncid/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SNCID
+ depends on RV
+ depends on IRQSOFF_TRACER
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "sncid monitor"
+ help
+ Monitor to ensure schedule is not called with interrupt disabled.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c
new file mode 100644
index 000000000000..f5037cd6214c
--- /dev/null
+++ b/kernel/trace/rv/monitors/sncid/sncid.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "sncid"
+
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "sncid.h"
+
+static struct rv_monitor rv_sncid;
+DECLARE_DA_MON_PER_CPU(sncid, unsigned char);
+
+static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_sncid(irq_disable_sncid);
+}
+
+static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_sncid(irq_enable_sncid);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_start_event_sncid(schedule_entry_sncid);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_start_event_sncid(schedule_exit_sncid);
+}
+
+static int enable_sncid(void)
+{
+ int retval;
+
+ retval = da_monitor_init_sncid();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable);
+ rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable);
+ rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_sncid(void)
+{
+ rv_sncid.enabled = 0;
+
+ rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable);
+ rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable);
+ rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_sncid();
+}
+
+static struct rv_monitor rv_sncid = {
+ .name = "sncid",
+ .description = "schedule not called with interrupt disabled.",
+ .enable = enable_sncid,
+ .disable = disable_sncid,
+ .reset = da_monitor_reset_all_sncid,
+ .enabled = 0,
+};
+
+static int __init register_sncid(void)
+{
+ rv_register_monitor(&rv_sncid, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_sncid(void)
+{
+ rv_unregister_monitor(&rv_sncid);
+}
+
+module_init(register_sncid);
+module_exit(unregister_sncid);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled.");
diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h
new file mode 100644
index 000000000000..21304725142b
--- /dev/null
+++ b/kernel/trace/rv/monitors/sncid/sncid.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sncid automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_sncid {
+ can_sched_sncid = 0,
+ cant_sched_sncid,
+ state_max_sncid
+};
+
+#define INVALID_STATE state_max_sncid
+
+enum events_sncid {
+ irq_disable_sncid = 0,
+ irq_enable_sncid,
+ schedule_entry_sncid,
+ schedule_exit_sncid,
+ event_max_sncid
+};
+
+struct automaton_sncid {
+ char *state_names[state_max_sncid];
+ char *event_names[event_max_sncid];
+ unsigned char function[state_max_sncid][event_max_sncid];
+ unsigned char initial_state;
+ bool final_states[state_max_sncid];
+};
+
+static const struct automaton_sncid automaton_sncid = {
+ .state_names = {
+ "can_sched",
+ "cant_sched"
+ },
+ .event_names = {
+ "irq_disable",
+ "irq_enable",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid },
+ { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE },
+ },
+ .initial_state = can_sched_sncid,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/sncid/sncid_trace.h
new file mode 100644
index 000000000000..3ce42a57671d
--- /dev/null
+++ b/kernel/trace/rv/monitors/sncid/sncid_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SNCID
+DEFINE_EVENT(event_da_monitor, event_sncid,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_sncid,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SNCID */
diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig
new file mode 100644
index 000000000000..77527f971232
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SNEP
+ depends on RV
+ depends on PREEMPT_TRACER
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "snep monitor"
+ help
+ Monitor to ensure schedule does not enable preempt.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c
new file mode 100644
index 000000000000..0076ba6d7ea4
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/snep.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "snep"
+
+#include <trace/events/sched.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "snep.h"
+
+static struct rv_monitor rv_snep;
+DECLARE_DA_MON_PER_CPU(snep, unsigned char);
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_snep(preempt_disable_snep);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_start_event_snep(preempt_enable_snep);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_event_snep(schedule_entry_snep);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_start_event_snep(schedule_exit_snep);
+}
+
+static int enable_snep(void)
+{
+ int retval;
+
+ retval = da_monitor_init_snep();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("snep", preempt_disable, handle_preempt_disable);
+ rv_attach_trace_probe("snep", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("snep", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("snep", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_snep(void)
+{
+ rv_snep.enabled = 0;
+
+ rv_detach_trace_probe("snep", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("snep", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("snep", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("snep", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_snep();
+}
+
+static struct rv_monitor rv_snep = {
+ .name = "snep",
+ .description = "schedule does not enable preempt.",
+ .enable = enable_snep,
+ .disable = disable_snep,
+ .reset = da_monitor_reset_all_snep,
+ .enabled = 0,
+};
+
+static int __init register_snep(void)
+{
+ rv_register_monitor(&rv_snep, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_snep(void)
+{
+ rv_unregister_monitor(&rv_snep);
+}
+
+module_init(register_snep);
+module_exit(unregister_snep);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("snep: schedule does not enable preempt.");
diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h
new file mode 100644
index 000000000000..6d16b9ad931e
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/snep.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of snep automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_snep {
+ non_scheduling_context_snep = 0,
+ scheduling_contex_snep,
+ state_max_snep
+};
+
+#define INVALID_STATE state_max_snep
+
+enum events_snep {
+ preempt_disable_snep = 0,
+ preempt_enable_snep,
+ schedule_entry_snep,
+ schedule_exit_snep,
+ event_max_snep
+};
+
+struct automaton_snep {
+ char *state_names[state_max_snep];
+ char *event_names[event_max_snep];
+ unsigned char function[state_max_snep][event_max_snep];
+ unsigned char initial_state;
+ bool final_states[state_max_snep];
+};
+
+static const struct automaton_snep automaton_snep = {
+ .state_names = {
+ "non_scheduling_context",
+ "scheduling_contex"
+ },
+ .event_names = {
+ "preempt_disable",
+ "preempt_enable",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE },
+ { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep },
+ },
+ .initial_state = non_scheduling_context_snep,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/snep/snep_trace.h b/kernel/trace/rv/monitors/snep/snep_trace.h
new file mode 100644
index 000000000000..01aad49a949a
--- /dev/null
+++ b/kernel/trace/rv/monitors/snep/snep_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SNEP
+DEFINE_EVENT(event_da_monitor, event_snep,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_snep,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_SNEP */
diff --git a/kernel/trace/rv/monitors/snroc/Kconfig b/kernel/trace/rv/monitors/snroc/Kconfig
new file mode 100644
index 000000000000..6e4365a2fea3
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SNROC
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_ID
+ bool "snroc monitor"
+ help
+ Monitor to ensure sched_set_state happens only in the respective task's context.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c
new file mode 100644
index 000000000000..bb1f60d55296
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/snroc.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "snroc"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "snroc.h"
+
+static struct rv_monitor rv_snroc;
+DECLARE_DA_MON_PER_TASK(snroc, unsigned char);
+
+static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+{
+ da_handle_event_snroc(tsk, sched_set_state_snroc);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ da_handle_start_event_snroc(prev, sched_switch_out_snroc);
+ da_handle_event_snroc(next, sched_switch_in_snroc);
+}
+
+static int enable_snroc(void)
+{
+ int retval;
+
+ retval = da_monitor_init_snroc();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("snroc", sched_switch, handle_sched_switch);
+
+ return 0;
+}
+
+static void disable_snroc(void)
+{
+ rv_snroc.enabled = 0;
+
+ rv_detach_trace_probe("snroc", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("snroc", sched_switch, handle_sched_switch);
+
+ da_monitor_destroy_snroc();
+}
+
+static struct rv_monitor rv_snroc = {
+ .name = "snroc",
+ .description = "set non runnable on its own context.",
+ .enable = enable_snroc,
+ .disable = disable_snroc,
+ .reset = da_monitor_reset_all_snroc,
+ .enabled = 0,
+};
+
+static int __init register_snroc(void)
+{
+ rv_register_monitor(&rv_snroc, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_snroc(void)
+{
+ rv_unregister_monitor(&rv_snroc);
+}
+
+module_init(register_snroc);
+module_exit(unregister_snroc);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("snroc: set non runnable on its own context.");
diff --git a/kernel/trace/rv/monitors/snroc/snroc.h b/kernel/trace/rv/monitors/snroc/snroc.h
new file mode 100644
index 000000000000..c3650a2b1b10
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/snroc.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of snroc automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_snroc {
+ other_context_snroc = 0,
+ own_context_snroc,
+ state_max_snroc
+};
+
+#define INVALID_STATE state_max_snroc
+
+enum events_snroc {
+ sched_set_state_snroc = 0,
+ sched_switch_in_snroc,
+ sched_switch_out_snroc,
+ event_max_snroc
+};
+
+struct automaton_snroc {
+ char *state_names[state_max_snroc];
+ char *event_names[event_max_snroc];
+ unsigned char function[state_max_snroc][event_max_snroc];
+ unsigned char initial_state;
+ bool final_states[state_max_snroc];
+};
+
+static const struct automaton_snroc automaton_snroc = {
+ .state_names = {
+ "other_context",
+ "own_context"
+ },
+ .event_names = {
+ "sched_set_state",
+ "sched_switch_in",
+ "sched_switch_out"
+ },
+ .function = {
+ { INVALID_STATE, own_context_snroc, INVALID_STATE },
+ { own_context_snroc, INVALID_STATE, other_context_snroc },
+ },
+ .initial_state = other_context_snroc,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/snroc/snroc_trace.h b/kernel/trace/rv/monitors/snroc/snroc_trace.h
new file mode 100644
index 000000000000..50114cef5122
--- /dev/null
+++ b/kernel/trace/rv/monitors/snroc/snroc_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SNROC
+DEFINE_EVENT(event_da_monitor_id, event_snroc,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_snroc,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_SNROC */
diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/tss/Kconfig
new file mode 100644
index 000000000000..479f86f52e60
--- /dev/null
+++ b/kernel/trace/rv/monitors/tss/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_TSS
+ depends on RV
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "tss monitor"
+ help
+ Monitor to ensure sched_switch happens only in scheduling context.
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c
new file mode 100644
index 000000000000..542787e6524f
--- /dev/null
+++ b/kernel/trace/rv/monitors/tss/tss.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "tss"
+
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "tss.h"
+
+static struct rv_monitor rv_tss;
+DECLARE_DA_MON_PER_CPU(tss, unsigned char);
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ da_handle_event_tss(sched_switch_tss);
+}
+
+static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+{
+ da_handle_event_tss(schedule_entry_tss);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+{
+ da_handle_start_event_tss(schedule_exit_tss);
+}
+
+static int enable_tss(void)
+{
+ int retval;
+
+ retval = da_monitor_init_tss();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("tss", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit);
+
+ return 0;
+}
+
+static void disable_tss(void)
+{
+ rv_tss.enabled = 0;
+
+ rv_detach_trace_probe("tss", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit);
+
+ da_monitor_destroy_tss();
+}
+
+static struct rv_monitor rv_tss = {
+ .name = "tss",
+ .description = "task switch while scheduling.",
+ .enable = enable_tss,
+ .disable = disable_tss,
+ .reset = da_monitor_reset_all_tss,
+ .enabled = 0,
+};
+
+static int __init register_tss(void)
+{
+ rv_register_monitor(&rv_tss, &rv_sched);
+ return 0;
+}
+
+static void __exit unregister_tss(void)
+{
+ rv_unregister_monitor(&rv_tss);
+}
+
+module_init(register_tss);
+module_exit(unregister_tss);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("tss: task switch while scheduling.");
diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h
new file mode 100644
index 000000000000..f0a36fda1b87
--- /dev/null
+++ b/kernel/trace/rv/monitors/tss/tss.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of tss automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_tss {
+ thread_tss = 0,
+ sched_tss,
+ state_max_tss
+};
+
+#define INVALID_STATE state_max_tss
+
+enum events_tss {
+ sched_switch_tss = 0,
+ schedule_entry_tss,
+ schedule_exit_tss,
+ event_max_tss
+};
+
+struct automaton_tss {
+ char *state_names[state_max_tss];
+ char *event_names[event_max_tss];
+ unsigned char function[state_max_tss][event_max_tss];
+ unsigned char initial_state;
+ bool final_states[state_max_tss];
+};
+
+static const struct automaton_tss automaton_tss = {
+ .state_names = {
+ "thread",
+ "sched"
+ },
+ .event_names = {
+ "sched_switch",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ { INVALID_STATE, sched_tss, INVALID_STATE },
+ { sched_tss, INVALID_STATE, thread_tss },
+ },
+ .initial_state = thread_tss,
+ .final_states = { 1, 0 },
+};
diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/tss/tss_trace.h
new file mode 100644
index 000000000000..4619dbb50cc0
--- /dev/null
+++ b/kernel/trace/rv/monitors/tss/tss_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_TSS
+DEFINE_EVENT(event_da_monitor, event_tss,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_tss,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_TSS */
diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig
index 3ef664b5cd90..e464b9294865 100644
--- a/kernel/trace/rv/monitors/wip/Kconfig
+++ b/kernel/trace/rv/monitors/wip/Kconfig
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
config RV_MON_WIP
depends on RV
depends on PREEMPT_TRACER
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index db7389157c87..ed758fec8608 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -71,7 +71,7 @@ static struct rv_monitor rv_wip = {
static int __init register_wip(void)
{
- rv_register_monitor(&rv_wip);
+ rv_register_monitor(&rv_wip, NULL);
return 0;
}
diff --git a/kernel/trace/rv/monitors/wip/wip.h b/kernel/trace/rv/monitors/wip/wip.h
index 2e373f2c65ed..c7193748bf36 100644
--- a/kernel/trace/rv/monitors/wip/wip.h
+++ b/kernel/trace/rv/monitors/wip/wip.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Automatically generated C representation of wip automaton
* For further information about this format, see kernel documentation:
diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig
index ee741aa6d6b8..d3bfc20037db 100644
--- a/kernel/trace/rv/monitors/wwnr/Kconfig
+++ b/kernel/trace/rv/monitors/wwnr/Kconfig
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
config RV_MON_WWNR
depends on RV
select DA_MON_EVENTS_ID
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 3b16994a9984..172f31c4b0f3 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -70,7 +70,7 @@ static struct rv_monitor rv_wwnr = {
static int __init register_wwnr(void)
{
- rv_register_monitor(&rv_wwnr);
+ rv_register_monitor(&rv_wwnr, NULL);
return 0;
}
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.h b/kernel/trace/rv/monitors/wwnr/wwnr.h
index d0d9c4b8121b..0a59d23edf61 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.h
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Automatically generated C representation of wwnr automaton
* For further information about this format, see kernel documentation:
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 8657fc8806e7..50344aa9f7f9 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -162,7 +162,7 @@ struct dentry *get_monitors_root(void)
/*
* Interface for the monitor register.
*/
-static LIST_HEAD(rv_monitors_list);
+LIST_HEAD(rv_monitors_list);
static int task_monitor_count;
static bool task_monitor_slots[RV_PER_TASK_MONITORS];
@@ -207,6 +207,30 @@ void rv_put_task_monitor_slot(int slot)
}
/*
+ * Monitors with a parent are nested; monitors without a parent are
+ * either standalone or containers.
+ */
+bool rv_is_nested_monitor(struct rv_monitor_def *mdef)
+{
+ return mdef->parent != NULL;
+}
+
+/*
+ * The list keeps nested monitors right after their parent, so if a
+ * monitor is followed by a child it is a container.
+ * Containers can also be identified by their function pointers: since
+ * they are not real monitors they do not provide enable()/disable()
+ * definitions. Use this to catch empty containers as well, and keep
+ * both checks in case a container is non-compliant.
+ */
+bool rv_is_container_monitor(struct rv_monitor_def *mdef)
+{
+ struct rv_monitor_def *next = list_next_entry(mdef, list);
+
+ return next->parent == mdef->monitor || !mdef->monitor->enable;
+}
+
+/*
* This section collects the monitor/ files and folders.
*/
static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
@@ -229,7 +253,8 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
if (mdef->monitor->enabled) {
mdef->monitor->enabled = 0;
- mdef->monitor->disable();
+ if (mdef->monitor->disable)
+ mdef->monitor->disable();
/*
* Wait for the execution of all events to finish.
@@ -243,6 +268,60 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
return 0;
}
+static void rv_disable_single(struct rv_monitor_def *mdef)
+{
+ __rv_disable_monitor(mdef, true);
+}
+
+static int rv_enable_single(struct rv_monitor_def *mdef)
+{
+ int retval;
+
+ lockdep_assert_held(&rv_interface_lock);
+
+ if (mdef->monitor->enabled)
+ return 0;
+
+ retval = mdef->monitor->enable();
+
+ if (!retval)
+ mdef->monitor->enabled = 1;
+
+ return retval;
+}
+
+static void rv_disable_container(struct rv_monitor_def *mdef)
+{
+ struct rv_monitor_def *p = mdef;
+ int enabled = 0;
+
+ list_for_each_entry_continue(p, &rv_monitors_list, list) {
+ if (p->parent != mdef->monitor)
+ break;
+ enabled += __rv_disable_monitor(p, false);
+ }
+ if (enabled)
+ tracepoint_synchronize_unregister();
+ mdef->monitor->enabled = 0;
+}
+
+static int rv_enable_container(struct rv_monitor_def *mdef)
+{
+ struct rv_monitor_def *p = mdef;
+ int retval = 0;
+
+ list_for_each_entry_continue(p, &rv_monitors_list, list) {
+ if (retval || p->parent != mdef->monitor)
+ break;
+ retval = rv_enable_single(p);
+ }
+ if (retval)
+ rv_disable_container(mdef);
+ else
+ mdef->monitor->enabled = 1;
+ return retval;
+}
+
/**
* rv_disable_monitor - disable a given runtime monitor
* @mdef: Pointer to the monitor definition structure.
@@ -251,7 +330,11 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
*/
int rv_disable_monitor(struct rv_monitor_def *mdef)
{
- __rv_disable_monitor(mdef, true);
+ if (rv_is_container_monitor(mdef))
+ rv_disable_container(mdef);
+ else
+ rv_disable_single(mdef);
+
return 0;
}
@@ -265,15 +348,10 @@ int rv_enable_monitor(struct rv_monitor_def *mdef)
{
int retval;
- lockdep_assert_held(&rv_interface_lock);
-
- if (mdef->monitor->enabled)
- return 0;
-
- retval = mdef->monitor->enable();
-
- if (!retval)
- mdef->monitor->enabled = 1;
+ if (rv_is_container_monitor(mdef))
+ retval = rv_enable_container(mdef);
+ else
+ retval = rv_enable_single(mdef);
return retval;
}
@@ -336,9 +414,9 @@ static const struct file_operations interface_desc_fops = {
* the monitor dir, where the specific options of the monitor
* are exposed.
*/
-static int create_monitor_dir(struct rv_monitor_def *mdef)
+static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent)
{
- struct dentry *root = get_monitors_root();
+ struct dentry *root = parent ? parent->root_d : get_monitors_root();
const char *name = mdef->monitor->name;
struct dentry *tmp;
int retval;
@@ -377,7 +455,11 @@ static int monitors_show(struct seq_file *m, void *p)
{
struct rv_monitor_def *mon_def = p;
- seq_printf(m, "%s\n", mon_def->monitor->name);
+ if (mon_def->parent)
+ seq_printf(m, "%s:%s\n", mon_def->parent->name,
+ mon_def->monitor->name);
+ else
+ seq_printf(m, "%s\n", mon_def->monitor->name);
return 0;
}
@@ -514,7 +596,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
struct rv_monitor_def *mdef;
int retval = -EINVAL;
bool enable = true;
- char *ptr;
+ char *ptr, *tmp;
int len;
if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1)
@@ -541,6 +623,11 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
retval = -EINVAL;
+ /* we support 1 nesting level, trim the parent */
+ tmp = strstr(ptr, ":");
+ if (tmp)
+ ptr = tmp+1;
+
list_for_each_entry(mdef, &rv_monitors_list, list) {
if (strcmp(ptr, mdef->monitor->name) != 0)
continue;
@@ -613,7 +700,7 @@ static void reset_all_monitors(void)
struct rv_monitor_def *mdef;
list_for_each_entry(mdef, &rv_monitors_list, list) {
- if (mdef->monitor->enabled)
+ if (mdef->monitor->enabled && mdef->monitor->reset)
mdef->monitor->reset();
}
}
@@ -685,18 +772,19 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef)
/**
* rv_register_monitor - register a rv monitor.
* @monitor: The rv_monitor to be registered.
+ * @parent: The parent of the monitor to be registered, NULL if not nested.
*
* Returns 0 if successful, error otherwise.
*/
-int rv_register_monitor(struct rv_monitor *monitor)
+int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
{
- struct rv_monitor_def *r;
+ struct rv_monitor_def *r, *p = NULL;
int retval = 0;
if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
pr_info("Monitor %s has a name longer than %d\n", monitor->name,
MAX_RV_MONITOR_NAME_SIZE);
- return -1;
+ return -EINVAL;
}
mutex_lock(&rv_interface_lock);
@@ -704,11 +792,26 @@ int rv_register_monitor(struct rv_monitor *monitor)
list_for_each_entry(r, &rv_monitors_list, list) {
if (strcmp(monitor->name, r->monitor->name) == 0) {
pr_info("Monitor %s is already registered\n", monitor->name);
- retval = -1;
+ retval = -EEXIST;
goto out_unlock;
}
}
+ if (parent) {
+ list_for_each_entry(r, &rv_monitors_list, list) {
+ if (strcmp(parent->name, r->monitor->name) == 0) {
+ p = r;
+ break;
+ }
+ }
+ }
+
+ if (p && rv_is_nested_monitor(p)) {
+ pr_info("Parent monitor %s is already nested, cannot nest further\n",
+ parent->name);
+ retval = -EINVAL;
+ goto out_unlock;
+ }
+
r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
if (!r) {
retval = -ENOMEM;
@@ -716,14 +819,19 @@ int rv_register_monitor(struct rv_monitor *monitor)
}
r->monitor = monitor;
+ r->parent = parent;
- retval = create_monitor_dir(r);
+ retval = create_monitor_dir(r, p);
if (retval) {
kfree(r);
goto out_unlock;
}
- list_add_tail(&r->list, &rv_monitors_list);
+ /* keep children close to the parent for easier visualisation */
+ if (p)
+ list_add(&r->list, &p->list);
+ else
+ list_add_tail(&r->list, &rv_monitors_list);
out_unlock:
mutex_unlock(&rv_interface_lock);
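With one level of nesting in place, the user-visible interface changes slightly: the monitor listings show nested monitors as parent:child (for example "sched:tss"), and the write path trims everything up to the ':', so writing either "sched:tss" or just "tss" to enabled_monitors enables the nested monitor, while writing the container name itself (e.g. "sched") enables or disables every monitor it contains. The tracefs location of these files (/sys/kernel/tracing/rv/) is assumed from the existing RV layout rather than shown in this diff.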
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index db6cb0913dbd..98fca0a1adbc 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -21,6 +21,7 @@ struct rv_interface {
#define MAX_RV_REACTOR_NAME_SIZE 32
extern struct mutex rv_interface_lock;
+extern struct list_head rv_monitors_list;
#ifdef CONFIG_RV_REACTORS
struct rv_reactor_def {
@@ -34,6 +35,7 @@ struct rv_reactor_def {
struct rv_monitor_def {
struct list_head list;
struct rv_monitor *monitor;
+ struct rv_monitor *parent;
struct dentry *root_d;
#ifdef CONFIG_RV_REACTORS
struct rv_reactor_def *rdef;
@@ -45,6 +47,8 @@ struct rv_monitor_def {
struct dentry *get_monitors_root(void);
int rv_disable_monitor(struct rv_monitor_def *mdef);
int rv_enable_monitor(struct rv_monitor_def *mdef);
+bool rv_is_container_monitor(struct rv_monitor_def *mdef);
+bool rv_is_nested_monitor(struct rv_monitor_def *mdef);
#ifdef CONFIG_RV_REACTORS
int reactor_populate_monitor(struct rv_monitor_def *mdef);
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 7b49cbe388d4..9501ca886d83 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -158,8 +158,9 @@ static const struct seq_operations monitor_reactors_seq_ops = {
.show = monitor_reactor_show
};
-static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor_def *rdef,
- bool reacting)
+static void monitor_swap_reactors_single(struct rv_monitor_def *mdef,
+ struct rv_reactor_def *rdef,
+ bool reacting, bool nested)
{
bool monitor_enabled;
@@ -179,10 +180,31 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, struct rv_reactor
mdef->reacting = reacting;
mdef->monitor->react = rdef->reactor->react;
- if (monitor_enabled)
+ /* enable only once if iterating through a container */
+ if (monitor_enabled && !nested)
rv_enable_monitor(mdef);
}
+static void monitor_swap_reactors(struct rv_monitor_def *mdef,
+ struct rv_reactor_def *rdef, bool reacting)
+{
+ struct rv_monitor_def *p = mdef;
+
+ if (rv_is_container_monitor(mdef))
+ list_for_each_entry_continue(p, &rv_monitors_list, list) {
+ if (p->parent != mdef->monitor)
+ break;
+ monitor_swap_reactors_single(p, rdef, reacting, true);
+ }
+ /*
+ * This call disables and re-enables the monitor if it was active.
+ * In case of a container, all nested monitors were already disabled
+ * above and will all be re-enabled here, even the ones that were off;
+ * we may refine this logic in the future.
+ */
+ monitor_swap_reactors_single(mdef, rdef, reacting, false);
+}
+
static ssize_t
monitor_reactors_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 5e65097423ba..422b75f58891 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -58,6 +58,11 @@ DECLARE_EVENT_CLASS(error_da_monitor,
);
#include <monitors/wip/wip_trace.h>
+#include <monitors/tss/tss_trace.h>
+#include <monitors/sco/sco_trace.h>
+#include <monitors/scpd/scpd_trace.h>
+#include <monitors/snep/snep_trace.h>
+#include <monitors/sncid/sncid_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here
#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
@@ -118,6 +123,7 @@ DECLARE_EVENT_CLASS(error_da_monitor_id,
);
#include <monitors/wwnr/wwnr_trace.h>
+#include <monitors/snroc/snroc_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
#endif /* CONFIG_DA_MON_EVENTS_ID */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e6d517e74e0..103b193875b3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -49,6 +49,7 @@
#include <linux/fsnotify.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>
+#include <linux/sort.h>
#include <asm/setup.h> /* COMMAND_LINE_SIZE */
@@ -87,6 +88,7 @@ void __init disable_tracing_selftest(const char *reason)
static struct trace_iterator *tracepoint_print_iter;
int tracepoint_printk;
static bool tracepoint_printk_stop_on_boot __initdata;
+static bool traceoff_after_boot __initdata;
static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);
/* For tracers that don't implement custom flags */
@@ -330,6 +332,13 @@ static int __init set_tracepoint_printk_stop(char *str)
}
__setup("tp_printk_stop_on_boot", set_tracepoint_printk_stop);
+static int __init set_traceoff_after_boot(char *str)
+{
+ traceoff_after_boot = true;
+ return 1;
+}
+__setup("traceoff_after_boot", set_traceoff_after_boot);
+
unsigned long long ns2usecs(u64 nsec)
{
nsec += 500;
@@ -2878,13 +2887,16 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
void
trace_function(struct trace_array *tr, unsigned long ip, unsigned long
- parent_ip, unsigned int trace_ctx)
+ parent_ip, unsigned int trace_ctx, struct ftrace_regs *fregs)
{
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
+ int size = sizeof(*entry);
+
+ size += FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long);
- event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FN, size,
trace_ctx);
if (!event)
return;
@@ -2892,6 +2904,13 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long
entry->ip = ip;
entry->parent_ip = parent_ip;
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ if (fregs) {
+ for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+ entry->args[i] = ftrace_regs_get_argument(fregs, i);
+ }
+#endif
+
if (static_branch_unlikely(&trace_function_exports_enabled))
ftrace_exports(event, TRACE_EXPORT_FUNCTION);
__buffer_unlock_commit(buffer, event);
@@ -4100,12 +4119,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries,
total,
buf->cpu,
- preempt_model_none() ? "server" :
- preempt_model_voluntary() ? "desktop" :
- preempt_model_full() ? "preempt" :
- preempt_model_lazy() ? "lazy" :
- preempt_model_rt() ? "preempt_rt" :
- "unknown",
+ preempt_model_str(),
/* These are reserved for later use */
0, 0, 0, 0);
#ifdef CONFIG_SMP
@@ -4193,7 +4207,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
* safe to use if the array has delta offsets
* Force printing via the fields.
*/
- if ((tr->text_delta || tr->data_delta) &&
+ if ((tr->text_delta) &&
event->type > __TRACE_LAST_TYPE)
return print_event_fields(iter, event);
@@ -5988,11 +6002,130 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
return __tracing_resize_ring_buffer(tr, size, cpu_id);
}
+struct trace_mod_entry {
+ unsigned long mod_addr;
+ char mod_name[MODULE_NAME_LEN];
+};
+
+struct trace_scratch {
+ unsigned long text_addr;
+ unsigned long nr_entries;
+ struct trace_mod_entry entries[];
+};
+
+static DEFINE_MUTEX(scratch_mutex);
+
+static int cmp_mod_entry(const void *key, const void *pivot)
+{
+ unsigned long addr = (unsigned long)key;
+ const struct trace_mod_entry *ent = pivot;
+
+ if (addr >= ent[0].mod_addr && addr < ent[1].mod_addr)
+ return 0;
+ else
+ return addr - ent->mod_addr;
+}
+
+/**
+ * trace_adjust_address() - Adjust prev boot address to current address.
+ * @tr: Persistent ring buffer's trace_array.
+ * @addr: Address in @tr which is adjusted.
+ */
+unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_scratch *tscratch;
+ struct trace_mod_entry *entry;
+ int idx = 0, nr_entries;
+
+ /* If we don't have last boot delta, return the address */
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return addr;
+
+ /* tr->module_delta must be protected by rcu. */
+ guard(rcu)();
+ tscratch = tr->scratch;
+ /* If there is no tscratch, module_delta must be NULL. */
+ module_delta = READ_ONCE(tr->module_delta);
+ if (!module_delta || tscratch->entries[0].mod_addr > addr)
+ return addr + tr->text_delta;
+
+ /* Note that entries must be sorted. */
+ nr_entries = tscratch->nr_entries;
+ if (nr_entries == 1 ||
+ tscratch->entries[nr_entries - 1].mod_addr < addr)
+ idx = nr_entries - 1;
+ else {
+ entry = __inline_bsearch((void *)addr,
+ tscratch->entries,
+ nr_entries - 1,
+ sizeof(tscratch->entries[0]),
+ cmp_mod_entry);
+ if (entry)
+ idx = entry - tscratch->entries;
+ }
+
+ return addr + module_delta->delta[idx];
+}
+
+#ifdef CONFIG_MODULES
+static int save_mod(struct module *mod, void *data)
+{
+ struct trace_array *tr = data;
+ struct trace_scratch *tscratch;
+ struct trace_mod_entry *entry;
+ unsigned int size;
+
+ tscratch = tr->scratch;
+ if (!tscratch)
+ return -1;
+ size = tr->scratch_size;
+
+ if (struct_size(tscratch, entries, tscratch->nr_entries + 1) > size)
+ return -1;
+
+ entry = &tscratch->entries[tscratch->nr_entries];
+
+ tscratch->nr_entries++;
+
+ entry->mod_addr = (unsigned long)mod->mem[MOD_TEXT].base;
+ strscpy(entry->mod_name, mod->name);
+
+ return 0;
+}
+#else
+static int save_mod(struct module *mod, void *data)
+{
+ return 0;
+}
+#endif
+
static void update_last_data(struct trace_array *tr)
{
- if (!tr->text_delta && !tr->data_delta)
+ struct trace_module_delta *module_delta;
+ struct trace_scratch *tscratch;
+
+ if (!(tr->flags & TRACE_ARRAY_FL_BOOT))
+ return;
+
+ if (!(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
return;
+ /* Only clear and update it if the buffer has previous boot data. */
+ tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT;
+
+ /* Reset the module list and reload the modules */
+ if (tr->scratch) {
+ struct trace_scratch *tscratch = tr->scratch;
+
+ memset(tscratch->entries, 0,
+ flex_array_size(tscratch, entries, tscratch->nr_entries));
+ tscratch->nr_entries = 0;
+
+ guard(mutex)(&scratch_mutex);
+ module_for_each_mod(save_mod, tr);
+ }
+
/*
* Need to clear all CPU buffers as there cannot be events
* from the previous boot mixed with events with this boot
@@ -6003,7 +6136,17 @@ static void update_last_data(struct trace_array *tr)
/* Using current data now */
tr->text_delta = 0;
- tr->data_delta = 0;
+
+ if (!tr->scratch)
+ return;
+
+ tscratch = tr->scratch;
+ module_delta = READ_ONCE(tr->module_delta);
+ WRITE_ONCE(tr->module_delta, NULL);
+ kfree_rcu(module_delta, rcu);
+
+ /* Set the persistent ring buffer metadata to this address */
+ tscratch->text_addr = (unsigned long)_text;
}
/**
@@ -6812,19 +6955,102 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
-static ssize_t
-tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+#define LAST_BOOT_HEADER ((void *)1)
+
+static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_array *tr = filp->private_data;
- struct seq_buf seq;
- char buf[64];
+ struct trace_array *tr = m->private;
+ struct trace_scratch *tscratch = tr->scratch;
+ unsigned int index = *pos;
+
+ (*pos)++;
+
+ if (*pos == 1)
+ return LAST_BOOT_HEADER;
+
+ /* Only show offsets of the last boot data */
+ if (!tscratch || !(tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ return NULL;
+
+ /* *pos 0 is for the header, 1 is for the first module */
+ index--;
+
+ if (index >= tscratch->nr_entries)
+ return NULL;
+
+ return &tscratch->entries[index];
+}
- seq_buf_init(&seq, buf, 64);
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ mutex_lock(&scratch_mutex);
+
+ return l_next(m, NULL, pos);
+}
+
+static void l_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&scratch_mutex);
+}
- seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta);
- seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta);
+static void show_last_boot_header(struct seq_file *m, struct trace_array *tr)
+{
+ struct trace_scratch *tscratch = tr->scratch;
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq));
+ /*
+ * Do not leak the KASLR address. This only shows the KASLR address of
+ * the last boot. When the ring buffer is started, the LAST_BOOT
+ * flag gets cleared, and this should only report "current".
+ * Otherwise it would show the KASLR address from the previous boot,
+ * which should not be the same as the current boot.
+ */
+ if (tscratch && (tr->flags & TRACE_ARRAY_FL_LAST_BOOT))
+ seq_printf(m, "%lx\t[kernel]\n", tscratch->text_addr);
+ else
+ seq_puts(m, "# Current\n");
+}
+
+static int l_show(struct seq_file *m, void *v)
+{
+ struct trace_array *tr = m->private;
+ struct trace_mod_entry *entry = v;
+
+ if (v == LAST_BOOT_HEADER) {
+ show_last_boot_header(m, tr);
+ return 0;
+ }
+
+ seq_printf(m, "%lx\t%s\n", entry->mod_addr, entry->mod_name);
+ return 0;
+}
+
+static const struct seq_operations last_boot_seq_ops = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show,
+};
+
+static int tracing_last_boot_open(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+ struct seq_file *m;
+ int ret;
+
+ ret = tracing_check_open_get_tr(tr);
+ if (ret)
+ return ret;
+
+ ret = seq_open(file, &last_boot_seq_ops);
+ if (ret) {
+ trace_array_put(tr);
+ return ret;
+ }
+
+ m = file->private_data;
+ m->private = tr;
+
+ return 0;
}
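
For reference, the seq_file plumbing above follows the usual start/next/stop/show contract, with position 0 reserved for a header sentinel and positions 1..N mapped onto the saved module entries. A small userspace sketch of that walk (the module names and addresses are invented):

    #include <stdio.h>

    #define HEADER ((void *)1)

    struct ent { unsigned long addr; const char *name; };

    static struct ent ents[] = { { 0xffff8000, "ext4" }, { 0xffffc000, "xfs" } };
    static const int nr = 2;

    static void *next(long *pos)
    {
        long idx = (*pos)++;

        if (idx == 0)
            return HEADER;            /* position 0: header sentinel */
        return idx <= nr ? &ents[idx - 1] : NULL;
    }

    int main(void)
    {
        long pos = 0;
        void *v;

        while ((v = next(&pos))) {
            if (v == HEADER)
                printf("%lx\t[kernel]\n", 0xffff0000UL);
            else
                printf("%lx\t%s\n", ((struct ent *)v)->addr,
                       ((struct ent *)v)->name);
        }
        return 0;
    }
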
static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
@@ -7453,10 +7679,10 @@ static const struct file_operations trace_time_stamp_mode_fops = {
};
static const struct file_operations last_boot_fops = {
- .open = tracing_open_generic_tr,
- .read = tracing_last_boot_read,
- .llseek = generic_file_llseek,
- .release = tracing_release_generic_tr,
+ .open = tracing_last_boot_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = tracing_seq_release,
};
#ifdef CONFIG_TRACER_SNAPSHOT
@@ -9196,22 +9422,125 @@ static struct dentry *trace_instance_dir;
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
+#ifdef CONFIG_MODULES
+static int make_mod_delta(struct module *mod, void *data)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_scratch *tscratch;
+ struct trace_mod_entry *entry;
+ struct trace_array *tr = data;
+ int i;
+
+ tscratch = tr->scratch;
+ module_delta = READ_ONCE(tr->module_delta);
+ for (i = 0; i < tscratch->nr_entries; i++) {
+ entry = &tscratch->entries[i];
+ if (strcmp(mod->name, entry->mod_name))
+ continue;
+ if (mod->state == MODULE_STATE_GOING)
+ module_delta->delta[i] = 0;
+ else
+ module_delta->delta[i] = (unsigned long)mod->mem[MOD_TEXT].base
+ - entry->mod_addr;
+ break;
+ }
+ return 0;
+}
+#else
+static int make_mod_delta(struct module *mod, void *data)
+{
+ return 0;
+}
+#endif
+
+static int mod_addr_comp(const void *a, const void *b, const void *data)
+{
+ const struct trace_mod_entry *e1 = a;
+ const struct trace_mod_entry *e2 = b;
+
+ return e1->mod_addr > e2->mod_addr ? 1 : -1;
+}
+
+static void setup_trace_scratch(struct trace_array *tr,
+ struct trace_scratch *tscratch, unsigned int size)
+{
+ struct trace_module_delta *module_delta;
+ struct trace_mod_entry *entry;
+ int i, nr_entries;
+
+ if (!tscratch)
+ return;
+
+ tr->scratch = tscratch;
+ tr->scratch_size = size;
+
+ if (tscratch->text_addr)
+ tr->text_delta = (unsigned long)_text - tscratch->text_addr;
+
+ if (struct_size(tscratch, entries, tscratch->nr_entries) > size)
+ goto reset;
+
+ /* Check if each module name is a valid string */
+ for (i = 0; i < tscratch->nr_entries; i++) {
+ int n;
+
+ entry = &tscratch->entries[i];
+
+ for (n = 0; n < MODULE_NAME_LEN; n++) {
+ if (entry->mod_name[n] == '\0')
+ break;
+ if (!isprint(entry->mod_name[n]))
+ goto reset;
+ }
+ if (n == MODULE_NAME_LEN)
+ goto reset;
+ }
+
+ /* Sort the entries so that we can find the appropriate module from an address. */
+ nr_entries = tscratch->nr_entries;
+ sort_r(tscratch->entries, nr_entries, sizeof(struct trace_mod_entry),
+ mod_addr_comp, NULL, NULL);
+
+ if (IS_ENABLED(CONFIG_MODULES)) {
+ module_delta = kzalloc(struct_size(module_delta, delta, nr_entries), GFP_KERNEL);
+ if (!module_delta) {
+ pr_info("module_delta allocation failed. Not able to decode module address.");
+ goto reset;
+ }
+ init_rcu_head(&module_delta->rcu);
+ } else
+ module_delta = NULL;
+ WRITE_ONCE(tr->module_delta, module_delta);
+
+ /* Scan modules to compute the text delta for each module. */
+ module_for_each_mod(make_mod_delta, tr);
+ return;
+ reset:
+ /* Invalid trace modules */
+ memset(tscratch, 0, size);
+}
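
Because the scratch area lives in persistent memory that may be stale or corrupted, the saved names are checked to be printable, NUL-terminated strings before anything trusts them, and the entries are then sorted by base address so the bsearch above can work. A userspace sketch of those two steps, assuming MODULE_NAME_LEN-style fixed-size names (56 bytes here) and a conventional three-way comparator in place of sort_r():

    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define NAME_LEN 56    /* stands in for MODULE_NAME_LEN */

    struct mod_entry { unsigned long mod_addr; char mod_name[NAME_LEN]; };

    /* Reject names that are not printable, NUL-terminated strings. */
    static int name_ok(const struct mod_entry *e)
    {
        for (int n = 0; n < NAME_LEN; n++) {
            if (e->mod_name[n] == '\0')
                return 1;
            if (!isprint((unsigned char)e->mod_name[n]))
                return 0;
        }
        return 0;    /* no NUL within NAME_LEN */
    }

    static int by_addr(const void *a, const void *b)
    {
        const struct mod_entry *e1 = a, *e2 = b;

        return (e1->mod_addr > e2->mod_addr) - (e1->mod_addr < e2->mod_addr);
    }

    int main(void)
    {
        struct mod_entry ents[] = {
            { 0xffffc000, "xfs" }, { 0xffff8000, "ext4" },
        };
        int nr = 2;

        for (int i = 0; i < nr; i++)
            if (!name_ok(&ents[i]))
                return 1;    /* the kernel wipes the scratch area instead */

        qsort(ents, nr, sizeof(ents[0]), by_addr);
        for (int i = 0; i < nr; i++)
            printf("%lx\t%s\n", ents[i].mod_addr, ents[i].mod_name);
        return 0;
    }
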
+
static int
allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size)
{
enum ring_buffer_flags rb_flags;
+ struct trace_scratch *tscratch;
+ unsigned int scratch_size = 0;
rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
buf->tr = tr;
if (tr->range_addr_start && tr->range_addr_size) {
+ /* Add scratch buffer to handle 128 modules */
buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
tr->range_addr_start,
- tr->range_addr_size);
+ tr->range_addr_size,
+ struct_size(tscratch, entries, 128));
+
+ tscratch = ring_buffer_meta_scratch(buf->buffer, &scratch_size);
+ setup_trace_scratch(tr, tscratch, scratch_size);
- ring_buffer_last_boot_delta(buf->buffer,
- &tr->text_delta, &tr->data_delta);
/*
* This is basically the same as a mapped buffer,
* with the same restrictions.
@@ -9284,6 +9613,9 @@ static void free_trace_buffers(struct trace_array *tr)
#ifdef CONFIG_TRACER_MAX_TRACE
free_trace_buffer(&tr->max_buffer);
#endif
+
+ if (tr->range_addr_start)
+ vunmap((void *)tr->range_addr_start);
}
static void init_trace_flags_index(struct trace_array *tr)
@@ -9445,6 +9777,7 @@ trace_array_create_systems(const char *name, const char *systems,
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree_const(tr->system_names);
+ kfree(tr->range_name);
kfree(tr->name);
kfree(tr);
@@ -9571,6 +9904,11 @@ static int __remove_instance(struct trace_array *tr)
free_trace_buffers(tr);
clear_tracing_err_log(tr);
+ if (tr->range_name) {
+ reserve_mem_release_by_name(tr->range_name);
+ kfree(tr->range_name);
+ }
+
for (i = 0; i < tr->nr_topts; i++) {
kfree(tr->topts[i].topts);
}
@@ -9892,6 +10230,24 @@ static void trace_module_remove_evals(struct module *mod)
static inline void trace_module_remove_evals(struct module *mod) { }
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
+static void trace_module_record(struct module *mod, bool add)
+{
+ struct trace_array *tr;
+ unsigned long flags;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ flags = tr->flags & (TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT);
+ /* Update any persistent trace array that has already been started */
+ if (flags == TRACE_ARRAY_FL_BOOT && add) {
+ guard(mutex)(&scratch_mutex);
+ save_mod(mod, tr);
+ } else if (flags & TRACE_ARRAY_FL_LAST_BOOT) {
+ /* Update the delta if the module was loaded in the previous boot */
+ make_mod_delta(mod, tr);
+ }
+ }
+}
+
static int trace_module_notify(struct notifier_block *self,
unsigned long val, void *data)
{
@@ -9900,9 +10256,11 @@ static int trace_module_notify(struct notifier_block *self,
switch (val) {
case MODULE_STATE_COMING:
trace_module_add_evals(mod);
+ trace_module_record(mod, true);
break;
case MODULE_STATE_GOING:
trace_module_remove_evals(mod);
+ trace_module_record(mod, false);
break;
}
@@ -10368,6 +10726,7 @@ __init static void enable_instances(void)
bool traceoff = false;
char *flag_delim;
char *addr_delim;
+ char *rname __free(kfree) = NULL;
tok = strsep(&curr_str, ",");
@@ -10424,6 +10783,7 @@ __init static void enable_instances(void)
pr_warn("Failed to map boot instance %s to %s\n", name, tok);
continue;
}
+ rname = kstrdup(tok, GFP_KERNEL);
}
if (start) {
@@ -10459,8 +10819,8 @@ __init static void enable_instances(void)
* to it.
*/
if (start) {
- tr->flags |= TRACE_ARRAY_FL_BOOT;
- tr->ref++;
+ tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
+ tr->range_name = no_free_ptr(rname);
}
while ((tok = strsep(&curr_str, ","))) {
@@ -10704,6 +11064,9 @@ __init static int late_trace_init(void)
tracepoint_printk = 0;
}
+ if (traceoff_after_boot)
+ tracing_off();
+
tracing_set_default_clock();
clear_boot_tracer();
return 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9c21ba45b7af..c20f6bcc200a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -21,6 +21,7 @@
#include <linux/workqueue.h>
#include <linux/ctype.h>
#include <linux/once_lite.h>
+#include <linux/ftrace_regs.h>
#include "pid_list.h"
@@ -312,6 +313,11 @@ struct trace_func_repeats {
u64 ts_last_call;
};
+struct trace_module_delta {
+ struct rcu_head rcu;
+ long delta[];
+};
+
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
@@ -348,8 +354,13 @@ struct trace_array {
unsigned int mapped;
unsigned long range_addr_start;
unsigned long range_addr_size;
+ char *range_name;
long text_delta;
- long data_delta;
+ struct trace_module_delta *module_delta;
+ void *scratch; /* pointer in persistent memory */
+ int scratch_size;
+
+ int buffer_disabled;
struct trace_pid_list __rcu *filtered_pids;
struct trace_pid_list __rcu *filtered_no_pids;
@@ -367,7 +378,6 @@ struct trace_array {
* CONFIG_TRACER_MAX_TRACE.
*/
arch_spinlock_t max_lock;
- int buffer_disabled;
#ifdef CONFIG_FTRACE_SYSCALLS
int sys_refcount_enter;
int sys_refcount_exit;
@@ -433,9 +443,10 @@ struct trace_array {
};
enum {
- TRACE_ARRAY_FL_GLOBAL = BIT(0),
- TRACE_ARRAY_FL_BOOT = BIT(1),
- TRACE_ARRAY_FL_MOD_INIT = BIT(2),
+ TRACE_ARRAY_FL_GLOBAL = BIT(0),
+ TRACE_ARRAY_FL_BOOT = BIT(1),
+ TRACE_ARRAY_FL_LAST_BOOT = BIT(2),
+ TRACE_ARRAY_FL_MOD_INIT = BIT(3),
};
#ifdef CONFIG_MODULES
@@ -462,6 +473,8 @@ extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
extern bool trace_clock_in_ns(struct trace_array *tr);
+extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long addr);
+
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
@@ -697,7 +710,8 @@ unsigned long trace_total_entries(struct trace_array *tr);
void trace_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
- unsigned int trace_ctx);
+ unsigned int trace_ctx,
+ struct ftrace_regs *regs);
void trace_graph_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
@@ -783,6 +797,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
extern int trace_find_tgid(int pid);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
+extern int trace_events_enabled(struct trace_array *tr, const char *system);
+
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
extern unsigned long ftrace_number_of_pages;
@@ -897,6 +913,7 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
#define TRACE_GRAPH_PRINT_RETVAL 0x800
#define TRACE_GRAPH_PRINT_RETVAL_HEX 0x1000
#define TRACE_GRAPH_PRINT_RETADDR 0x2000
+#define TRACE_GRAPH_ARGS 0x4000
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
@@ -1714,7 +1731,7 @@ struct event_trigger_data {
unsigned long count;
int ref;
int flags;
- struct event_trigger_ops *ops;
+ const struct event_trigger_ops *ops;
struct event_command *cmd_ops;
struct event_filter __rcu *filter;
char *filter_str;
@@ -1959,7 +1976,7 @@ struct event_command {
int (*set_filter)(char *filter_str,
struct event_trigger_data *data,
struct trace_event_file *file);
- struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
+ const struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
};
/**
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index fbfb396905a6..ee40d4e6ad1c 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -61,8 +61,9 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
TRACE_FN,
F_STRUCT(
- __field_fn( unsigned long, ip )
- __field_fn( unsigned long, parent_ip )
+ __field_fn( unsigned long, ip )
+ __field_fn( unsigned long, parent_ip )
+ __dynamic_array( unsigned long, args )
),
F_printk(" %ps <-- %ps",
@@ -72,17 +73,18 @@ FTRACE_ENTRY_REG(function, ftrace_entry,
);
/* Function call entry */
-FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
+FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
TRACE_GRAPH_ENT,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
__field_packed( unsigned long, graph_ent, func )
- __field_packed( int, graph_ent, depth )
+ __field_packed( unsigned long, graph_ent, depth )
+ __dynamic_array(unsigned long, args )
),
- F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
+ F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth)
);
#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 82fd637cfc19..c08355c3ef32 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -478,7 +478,7 @@ static void eprobe_trigger_func(struct event_trigger_data *data,
__eprobe_trace_func(edata, rec);
}
-static struct event_trigger_ops eprobe_trigger_ops = {
+static const struct event_trigger_ops eprobe_trigger_ops = {
.trigger = eprobe_trigger_func,
.print = eprobe_trigger_print,
.init = eprobe_trigger_init,
@@ -507,8 +507,8 @@ static void eprobe_trigger_unreg_func(char *glob,
}
-static struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
- char *param)
+static const struct event_trigger_ops *eprobe_trigger_get_ops(char *cmd,
+ char *param)
{
return &eprobe_trigger_ops;
}
@@ -913,6 +913,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
if (argc - 2 > MAX_TRACE_ARGS) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
ret = -E2BIG;
goto error;
}
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 3ff9caa4a71b..a6bb7577e8c5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
- ret = perf_allow_tracepoint(&p_event->attr);
+ ret = perf_allow_tracepoint();
if (ret)
return ret;
@@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
- ret = perf_allow_tracepoint(&p_event->attr);
+ ret = perf_allow_tracepoint();
if (ret)
return ret;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 513de9ceb80e..8638b7f7ff85 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -790,7 +790,9 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
}
- call->class->reg(call, TRACE_REG_UNREGISTER, file);
+ ret = call->class->reg(call, TRACE_REG_UNREGISTER, file);
+
+ WARN_ON_ONCE(ret);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
if (file->flags & EVENT_FILE_FL_SOFT_MODE)
@@ -1818,28 +1820,28 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
return cnt;
}
-static ssize_t
-system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
- loff_t *ppos)
+/*
+ * Returns:
+ * 0 : no events exist?
+ * 1 : all events are disabled
+ * 2 : all events are enabled
+ * 3 : some events are enabled and some are disabled
+ */
+int trace_events_enabled(struct trace_array *tr, const char *system)
{
- const char set_to_char[4] = { '?', '0', '1', 'X' };
- struct trace_subsystem_dir *dir = filp->private_data;
- struct event_subsystem *system = dir->subsystem;
struct trace_event_call *call;
struct trace_event_file *file;
- struct trace_array *tr = dir->tr;
- char buf[2];
int set = 0;
- int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
+
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
!trace_event_name(call) || !call->class || !call->class->reg)
continue;
- if (system && strcmp(call->class->system, system->name) != 0)
+ if (system && strcmp(call->class->system, system) != 0)
continue;
/*
@@ -1855,7 +1857,23 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
if (set == 3)
break;
}
- mutex_unlock(&event_mutex);
+
+ return set;
+}
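
The returned value is a small bit set: bit 0 records that at least one disabled event was seen and bit 1 that at least one enabled one was seen, so 3 means a mix and indexes the 'X' in set_to_char[] below. A compact sketch of that encoding (userspace, with a plain int array standing in for the event list):

    #include <stdio.h>

    static int events_enabled(const int *enabled, int n)
    {
        int set = 0;

        for (int i = 0; i < n && set != 3; i++)
            set |= enabled[i] ? 2 : 1;
        return set;    /* 0: none, 1: all off, 2: all on, 3: mixed */
    }

    int main(void)
    {
        int ev[] = { 1, 0, 1 };
        const char set_to_char[] = { '?', '0', '1', 'X' };

        printf("%c\n", set_to_char[events_enabled(ev, 3)]);    /* prints 'X' */
        return 0;
    }
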
+
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ const char set_to_char[4] = { '?', '0', '1', 'X' };
+ struct trace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
+ struct trace_array *tr = dir->tr;
+ char buf[2];
+ int set;
+ int ret;
+
+ set = trace_events_enabled(tr, system ? system->name : NULL);
buf[0] = set_to_char[set];
buf[1] = '\n';
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 53dc6719181e..1260c23cfa5f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -6203,7 +6203,7 @@ static void event_hist_trigger_free(struct event_trigger_data *data)
}
}
-static struct event_trigger_ops event_hist_trigger_ops = {
+static const struct event_trigger_ops event_hist_trigger_ops = {
.trigger = event_hist_trigger,
.print = event_hist_trigger_print,
.init = event_hist_trigger_init,
@@ -6235,15 +6235,15 @@ static void event_hist_trigger_named_free(struct event_trigger_data *data)
}
}
-static struct event_trigger_ops event_hist_trigger_named_ops = {
+static const struct event_trigger_ops event_hist_trigger_named_ops = {
.trigger = event_hist_trigger,
.print = event_hist_trigger_print,
.init = event_hist_trigger_named_init,
.free = event_hist_trigger_named_free,
};
-static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
- char *param)
+static const struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
+ char *param)
{
return &event_hist_trigger_ops;
}
@@ -6838,38 +6838,38 @@ hist_enable_count_trigger(struct event_trigger_data *data,
hist_enable_trigger(data, buffer, rec, event);
}
-static struct event_trigger_ops hist_enable_trigger_ops = {
+static const struct event_trigger_ops hist_enable_trigger_ops = {
.trigger = hist_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops hist_enable_count_trigger_ops = {
+static const struct event_trigger_ops hist_enable_count_trigger_ops = {
.trigger = hist_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops hist_disable_trigger_ops = {
+static const struct event_trigger_ops hist_disable_trigger_ops = {
.trigger = hist_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops hist_disable_count_trigger_ops = {
+static const struct event_trigger_ops hist_disable_count_trigger_ops = {
.trigger = hist_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
hist_enable_get_trigger_ops(char *cmd, char *param)
{
- struct event_trigger_ops *ops;
+ const struct event_trigger_ops *ops;
bool enable;
enable = (strcmp(cmd, ENABLE_HIST_STR) == 0);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index e3f7d09e5512..969f48742d72 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -207,7 +207,7 @@ static int synth_field_string_size(char *type)
if (len == 0)
return 0; /* variable-length string */
- strncpy(buf, start, len);
+ memcpy(buf, start, len);
buf[len] = '\0';
err = kstrtouint(buf, 0, &size);
@@ -305,7 +305,7 @@ static const char *synth_field_fmt(char *type)
else if (strcmp(type, "gfp_t") == 0)
fmt = "%x";
else if (synth_field_is_string(type))
- fmt = "%.*s";
+ fmt = "%s";
else if (synth_field_is_stack(type))
fmt = "%s";
@@ -612,7 +612,7 @@ static int __set_synth_event_print_fmt(struct synth_event *event,
fmt = synth_field_fmt(event->fields[i]->type);
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
event->fields[i]->name, fmt,
- i == event->n_fields - 1 ? "" : ", ");
+ i == event->n_fields - 1 ? "" : " ");
}
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
@@ -852,6 +852,38 @@ static struct trace_event_fields synth_event_fields_array[] = {
{}
};
+static int synth_event_reg(struct trace_event_call *call,
+ enum trace_reg type, void *data)
+{
+ struct synth_event *event = container_of(call, struct synth_event, call);
+
+ switch (type) {
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+#endif
+ case TRACE_REG_REGISTER:
+ if (!try_module_get(event->mod))
+ return -EBUSY;
+ break;
+ default:
+ break;
+ }
+
+ int ret = trace_event_reg(call, type, data);
+
+ switch (type) {
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_UNREGISTER:
+#endif
+ case TRACE_REG_UNREGISTER:
+ module_put(event->mod);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
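
synth_event_reg() wraps the normal registration so the module that owns the synthetic event is pinned for as long as anything (ftrace or perf) is registered on it, and released again on unregister. A stripped-down userspace sketch of that pin/unpin pattern, with a toy refcount standing in for try_module_get()/module_put():

    #include <stdio.h>
    #include <stdbool.h>

    struct owner { int refs; bool dying; };

    static bool try_get(struct owner *o)
    {
        if (o->dying)
            return false;
        o->refs++;
        return true;
    }

    static void put(struct owner *o)
    {
        o->refs--;
    }

    static int do_register(struct owner *o)
    {
        if (!try_get(o))
            return -1;    /* the kernel code returns -EBUSY */
        /* ... the real registration would happen here ... */
        return 0;
    }

    static void do_unregister(struct owner *o)
    {
        /* ... the real unregistration ... */
        put(o);
    }

    int main(void)
    {
        struct owner mod = { 0, false };

        if (do_register(&mod) == 0)
            printf("registered, refs=%d\n", mod.refs);
        do_unregister(&mod);
        printf("unregistered, refs=%d\n", mod.refs);
        return 0;
    }
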
+
static int register_synth_event(struct synth_event *event)
{
struct trace_event_call *call = &event->call;
@@ -881,7 +913,7 @@ static int register_synth_event(struct synth_event *event)
goto out;
}
call->flags = TRACE_EVENT_FL_TRACEPOINT;
- call->class->reg = trace_event_reg;
+ call->class->reg = synth_event_reg;
call->class->probe = trace_event_raw_event_synth;
call->data = event;
call->tp = event->tp;
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d45448947094..b66b6d235d91 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -825,7 +825,7 @@ struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops,
void *private_data)
{
struct event_trigger_data *trigger_data;
- struct event_trigger_ops *trigger_ops;
+ const struct event_trigger_ops *trigger_ops;
trigger_ops = cmd_ops->get_trigger_ops(cmd, param);
@@ -1367,38 +1367,38 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static struct event_trigger_ops traceon_trigger_ops = {
+static const struct event_trigger_ops traceon_trigger_ops = {
.trigger = traceon_trigger,
.print = traceon_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops traceon_count_trigger_ops = {
+static const struct event_trigger_ops traceon_count_trigger_ops = {
.trigger = traceon_count_trigger,
.print = traceon_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops traceoff_trigger_ops = {
+static const struct event_trigger_ops traceoff_trigger_ops = {
.trigger = traceoff_trigger,
.print = traceoff_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops traceoff_count_trigger_ops = {
+static const struct event_trigger_ops traceoff_count_trigger_ops = {
.trigger = traceoff_count_trigger,
.print = traceoff_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
onoff_get_trigger_ops(char *cmd, char *param)
{
- struct event_trigger_ops *ops;
+ const struct event_trigger_ops *ops;
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
@@ -1491,21 +1491,21 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static struct event_trigger_ops snapshot_trigger_ops = {
+static const struct event_trigger_ops snapshot_trigger_ops = {
.trigger = snapshot_trigger,
.print = snapshot_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops snapshot_count_trigger_ops = {
+static const struct event_trigger_ops snapshot_count_trigger_ops = {
.trigger = snapshot_count_trigger,
.print = snapshot_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
snapshot_get_trigger_ops(char *cmd, char *param)
{
return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops;
@@ -1586,21 +1586,21 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_data *data)
data->filter_str);
}
-static struct event_trigger_ops stacktrace_trigger_ops = {
+static const struct event_trigger_ops stacktrace_trigger_ops = {
.trigger = stacktrace_trigger,
.print = stacktrace_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops stacktrace_count_trigger_ops = {
+static const struct event_trigger_ops stacktrace_count_trigger_ops = {
.trigger = stacktrace_count_trigger,
.print = stacktrace_trigger_print,
.init = event_trigger_init,
.free = event_trigger_free,
};
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
stacktrace_get_trigger_ops(char *cmd, char *param)
{
return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops;
@@ -1711,28 +1711,28 @@ void event_enable_trigger_free(struct event_trigger_data *data)
}
}
-static struct event_trigger_ops event_enable_trigger_ops = {
+static const struct event_trigger_ops event_enable_trigger_ops = {
.trigger = event_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops event_enable_count_trigger_ops = {
+static const struct event_trigger_ops event_enable_count_trigger_ops = {
.trigger = event_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops event_disable_trigger_ops = {
+static const struct event_trigger_ops event_disable_trigger_ops = {
.trigger = event_enable_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
.free = event_enable_trigger_free,
};
-static struct event_trigger_ops event_disable_count_trigger_ops = {
+static const struct event_trigger_ops event_disable_count_trigger_ops = {
.trigger = event_enable_count_trigger,
.print = event_enable_trigger_print,
.init = event_trigger_init,
@@ -1916,10 +1916,10 @@ void event_enable_unregister_trigger(char *glob,
data->ops->free(data);
}
-static struct event_trigger_ops *
+static const struct event_trigger_ops *
event_enable_get_trigger_ops(char *cmd, char *param)
{
- struct event_trigger_ops *ops;
+ const struct event_trigger_ops *ops;
bool enable;
#ifdef CONFIG_HIST_TRIGGERS
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 97325fbd6283..af42aaa3d172 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -455,7 +455,7 @@ static void user_event_enabler_fault_fixup(struct work_struct *work)
if (ret && ret != -ENOENT) {
struct user_event *user = enabler->event;
- pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
+ pr_warn("user_events: Fault for mm: 0x%p @ 0x%llx event: %s\n",
mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
}
@@ -2793,11 +2793,8 @@ static int user_seq_show(struct seq_file *m, void *p)
seq_printf(m, "%s", EVENT_TP_NAME(user));
- if (status != 0)
- seq_puts(m, " #");
-
if (status != 0) {
- seq_puts(m, " Used by");
+ seq_puts(m, " # Used by");
if (status & EVENT_STATUS_FTRACE)
seq_puts(m, " ftrace");
if (status & EVENT_STATUS_PERF)
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index e27305d31fc5..5d7ca80173ea 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -920,13 +920,8 @@ static void __find_tracepoint_module_cb(struct tracepoint *tp, struct module *mo
if (!data->tpoint && !strcmp(data->tp_name, tp->name)) {
data->tpoint = tp;
- if (!data->mod) {
+ if (!data->mod)
data->mod = mod;
- if (!try_module_get(data->mod)) {
- data->tpoint = NULL;
- data->mod = NULL;
- }
- }
}
}
@@ -938,13 +933,7 @@ static void __find_tracepoint_cb(struct tracepoint *tp, void *priv)
data->tpoint = tp;
}
-/*
- * Find a tracepoint from kernel and module. If the tracepoint is in a module,
- * this increments the module refcount to prevent unloading until the
- * trace_fprobe is registered to the list. After registering the trace_fprobe
- * on the trace_fprobe list, the module refcount is decremented because
- * tracepoint_probe_module_cb will handle it.
- */
+/* Find a tracepoint in the kernel and modules. */
static struct tracepoint *find_tracepoint(const char *tp_name,
struct module **tp_mod)
{
@@ -973,6 +962,7 @@ static void reenable_trace_fprobe(struct trace_fprobe *tf)
}
}
+/* Find a tracepoint in the specified module. */
static struct tracepoint *find_tracepoint_in_module(struct module *mod,
const char *tp_name)
{
@@ -1008,10 +998,13 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self,
reenable_trace_fprobe(tf);
}
} else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) {
- tracepoint_probe_unregister(tf->tpoint,
+ unregister_fprobe(&tf->fp);
+ if (trace_fprobe_is_tracepoint(tf)) {
+ tracepoint_probe_unregister(tf->tpoint,
tf->tpoint->probestub, NULL);
- tf->tpoint = NULL;
- tf->mod = NULL;
+ tf->tpoint = TRACEPOINT_STUB;
+ tf->mod = NULL;
+ }
}
}
mutex_unlock(&event_mutex);
@@ -1176,6 +1169,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
if (is_tracepoint) {
ctx->flags |= TPARG_FL_TPOINT;
tpoint = find_tracepoint(symbol, &tp_mod);
+ /* Lock the module until this tprobe is registered. */
+ if (tp_mod && !try_module_get(tp_mod)) {
+ tpoint = NULL;
+ tp_mod = NULL;
+ }
if (tpoint) {
ctx->funcname = kallsyms_lookup(
(unsigned long)tpoint->probestub,
@@ -1201,8 +1199,11 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
argc = new_argc;
argv = new_argv;
}
- if (argc > MAX_TRACE_ARGS)
+ if (argc > MAX_TRACE_ARGS) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
return -E2BIG;
+ }
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index df56f9b76010..98ccf3f00c51 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -25,6 +25,9 @@ static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
static void
+function_args_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs);
+static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs);
static void
@@ -42,9 +45,10 @@ enum {
TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */
TRACE_FUNC_OPT_STACK = 0x1,
TRACE_FUNC_OPT_NO_REPEATS = 0x2,
+ TRACE_FUNC_OPT_ARGS = 0x4,
/* Update this to next highest bit. */
- TRACE_FUNC_OPT_HIGHEST_BIT = 0x4
+ TRACE_FUNC_OPT_HIGHEST_BIT = 0x8
};
#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1)
@@ -114,6 +118,8 @@ static ftrace_func_t select_trace_function(u32 flags_val)
switch (flags_val & TRACE_FUNC_OPT_MASK) {
case TRACE_FUNC_NO_OPTS:
return function_trace_call;
+ case TRACE_FUNC_OPT_ARGS:
+ return function_args_trace_call;
case TRACE_FUNC_OPT_STACK:
return function_stack_trace_call;
case TRACE_FUNC_OPT_NO_REPEATS:
@@ -220,7 +226,34 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
data = this_cpu_ptr(tr->array_buffer.data);
if (!atomic_read(&data->disabled))
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+
+ ftrace_test_recursion_unlock(bit);
+}
+
+static void
+function_args_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct ftrace_regs *fregs)
+{
+ struct trace_array *tr = op->private;
+ struct trace_array_cpu *data;
+ unsigned int trace_ctx;
+ int bit;
+ int cpu;
+
+ if (unlikely(!tr->function_enabled))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ trace_ctx = tracing_gen_ctx();
+
+ cpu = smp_processor_id();
+ data = per_cpu_ptr(tr->array_buffer.data, cpu);
+ if (!atomic_read(&data->disabled))
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
ftrace_test_recursion_unlock(bit);
}
@@ -270,7 +303,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (likely(disabled == 1)) {
trace_ctx = tracing_gen_ctx_flags(flags);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
#ifdef CONFIG_UNWINDER_FRAME_POINTER
if (ftrace_pids_enabled(op))
skip++;
@@ -349,7 +382,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_dec();
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
out:
ftrace_test_recursion_unlock(bit);
@@ -389,7 +422,7 @@ function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_flags(flags);
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
__trace_stack(tr, trace_ctx, STACK_SKIP);
}
@@ -403,6 +436,9 @@ static struct tracer_opt func_opts[] = {
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
{ TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) },
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+ { TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) },
+#endif
{ } /* Always set a last empty entry */
};
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 136c750b0b4d..2f077d4158e5 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -71,6 +71,10 @@ static struct tracer_opt trace_opts[] = {
/* Display function return address ? */
{ TRACER_OPT(funcgraph-retaddr, TRACE_GRAPH_PRINT_RETADDR) },
#endif
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+ /* Display function arguments ? */
+ { TRACER_OPT(funcgraph-args, TRACE_GRAPH_ARGS) },
+#endif
/* Include sleep time (scheduled out) between entry and return */
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
@@ -110,25 +114,43 @@ static void
print_graph_duration(struct trace_array *tr, unsigned long long duration,
struct trace_seq *s, u32 flags);
-int __trace_graph_entry(struct trace_array *tr,
- struct ftrace_graph_ent *trace,
- unsigned int trace_ctx)
+static int __graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx, struct ftrace_regs *fregs)
{
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
+ int size;
- event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
- sizeof(*entry), trace_ctx);
+ /* If fregs is defined, add FTRACE_REGS_MAX_ARGS long-sized words */
+ size = sizeof(*entry) + (FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long));
+
+ event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, size, trace_ctx);
if (!event)
return 0;
- entry = ring_buffer_event_data(event);
- entry->graph_ent = *trace;
+
+ entry = ring_buffer_event_data(event);
+ entry->graph_ent = *trace;
+
+#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+ if (fregs) {
+ for (int i = 0; i < FTRACE_REGS_MAX_ARGS; i++)
+ entry->args[i] = ftrace_regs_get_argument(fregs, i);
+ }
+#endif
+
trace_buffer_unlock_commit_nostack(buffer, event);
return 1;
}
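
The size computation above leans on "!!fregs" collapsing the pointer into 0 or 1, so the extra argument words are only reserved in the ring buffer event when register state was actually captured. A tiny sketch of that arithmetic (the struct layout is illustrative, not the real ftrace_graph_ent_entry):

    #include <stdio.h>

    #define FTRACE_REGS_MAX_ARGS 6    /* per-arch budget, 6 on most arches */

    struct ent { unsigned long func; unsigned long depth; unsigned long args[]; };

    static size_t entry_size(const void *fregs)
    {
        return sizeof(struct ent) +
               FTRACE_REGS_MAX_ARGS * !!fregs * sizeof(long);
    }

    int main(void)
    {
        long regs_stub;

        printf("without args: %zu bytes\n", entry_size(NULL));
        printf("with args:    %zu bytes\n", entry_size(&regs_stub));
        return 0;
    }
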
+int __trace_graph_entry(struct trace_array *tr,
+ struct ftrace_graph_ent *trace,
+ unsigned int trace_ctx)
+{
+ return __graph_entry(tr, trace, trace_ctx, NULL);
+}
+
#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
int __trace_graph_retaddr_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
@@ -174,9 +196,9 @@ struct fgraph_times {
unsigned long long sleeptime; /* may be optional! */
};
-int trace_graph_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops,
- struct ftrace_regs *fregs)
+static int graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
@@ -246,7 +268,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
unsigned long retaddr = ftrace_graph_top_ret_addr(current);
ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
} else {
- ret = __trace_graph_entry(tr, trace, trace_ctx);
+ ret = __graph_entry(tr, trace, trace_ctx, fregs);
}
}
preempt_enable_notrace();
@@ -254,6 +276,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
return ret;
}
+int trace_graph_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ return graph_entry(trace, gops, NULL);
+}
+
+static int trace_graph_entry_args(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ return graph_entry(trace, gops, fregs);
+}
+
static void
__trace_graph_function(struct trace_array *tr,
unsigned long ip, unsigned int trace_ctx)
@@ -418,14 +454,17 @@ static int graph_trace_init(struct trace_array *tr)
{
int ret;
- tr->gops->entryfunc = trace_graph_entry;
+ if (tracer_flags_is_set(TRACE_GRAPH_ARGS))
+ tr->gops->entryfunc = trace_graph_entry_args;
+ else
+ tr->gops->entryfunc = trace_graph_entry;
if (tracing_thresh)
tr->gops->retfunc = trace_graph_thresh_return;
else
tr->gops->retfunc = trace_graph_return;
- /* Make gops functions are visible before we start tracing */
+ /* Make gops functions visible before we start tracing */
smp_mb();
ret = register_ftrace_graph(tr->gops);
@@ -436,6 +475,28 @@ static int graph_trace_init(struct trace_array *tr)
return 0;
}
+static int ftrace_graph_trace_args(struct trace_array *tr, int set)
+{
+ trace_func_graph_ent_t entry;
+
+ if (set)
+ entry = trace_graph_entry_args;
+ else
+ entry = trace_graph_entry;
+
+ /* See if there are any changes */
+ if (tr->gops->entryfunc == entry)
+ return 0;
+
+ unregister_ftrace_graph(tr->gops);
+
+ tr->gops->entryfunc = entry;
+
+ /* Make gops functions visible before we start tracing */
+ smp_mb();
+ return register_ftrace_graph(tr->gops);
+}
+
static void graph_trace_reset(struct trace_array *tr)
{
tracing_stop_cmdline_record();
@@ -775,7 +836,7 @@ static void print_graph_retaddr(struct trace_seq *s, struct fgraph_retaddr_ent_e
static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entry *entry,
struct ftrace_graph_ret *graph_ret, void *func,
- u32 opt_flags, u32 trace_flags)
+ u32 opt_flags, u32 trace_flags, int args_size)
{
unsigned long err_code = 0;
unsigned long retval = 0;
@@ -809,7 +870,14 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr
if (entry->ent.type != TRACE_GRAPH_RETADDR_ENT)
print_retaddr = false;
- trace_seq_printf(s, "%ps();", func);
+ trace_seq_printf(s, "%ps", func);
+
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
+ print_function_args(s, entry->args, (unsigned long)func);
+ trace_seq_putc(s, ';');
+ } else
+ trace_seq_puts(s, "();");
+
if (print_retval || print_retaddr)
trace_seq_puts(s, " /*");
else
@@ -836,7 +904,8 @@ static void print_graph_retval(struct trace_seq *s, struct ftrace_graph_ent_entr
#else
-#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags) do {} while (0)
+#define print_graph_retval(_seq, _ent, _ret, _func, _opt_flags, _trace_flags, args_size) \
+ do {} while (0)
#endif
@@ -852,16 +921,17 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
- unsigned long func;
+ unsigned long ret_func;
+ int args_size;
int cpu = iter->cpu;
int i;
+ args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args);
+
graph_ret = &ret_entry->ret;
call = &entry->graph_ent;
duration = ret_entry->rettime - ret_entry->calltime;
- func = call->func + iter->tr->text_delta;
-
if (data) {
struct fgraph_cpu_data *cpu_data;
@@ -887,16 +957,25 @@ print_graph_entry_leaf(struct trace_iterator *iter,
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
trace_seq_putc(s, ' ');
+ ret_func = graph_ret->func + iter->tr->text_delta;
+
/*
* Write out the function return value or return address
*/
if (flags & (__TRACE_GRAPH_PRINT_RETVAL | __TRACE_GRAPH_PRINT_RETADDR)) {
print_graph_retval(s, entry, graph_ret,
(void *)graph_ret->func + iter->tr->text_delta,
- flags, tr->trace_flags);
+ flags, tr->trace_flags, args_size);
} else {
- trace_seq_printf(s, "%ps();\n", (void *)func);
+ trace_seq_printf(s, "%ps", (void *)ret_func);
+
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long)) {
+ print_function_args(s, entry->args, ret_func);
+ trace_seq_putc(s, ';');
+ } else
+ trace_seq_puts(s, "();");
}
+ trace_seq_printf(s, "\n");
print_graph_irq(iter, graph_ret->func, TRACE_GRAPH_RET,
cpu, iter->ent->pid, flags);
@@ -913,6 +992,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
unsigned long func;
+ int args_size;
int i;
if (data) {
@@ -937,7 +1017,17 @@ print_graph_entry_nested(struct trace_iterator *iter,
func = call->func + iter->tr->text_delta;
- trace_seq_printf(s, "%ps() {", (void *)func);
+ trace_seq_printf(s, "%ps", (void *)func);
+
+ args_size = iter->ent_size - offsetof(struct ftrace_graph_ent_entry, args);
+
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long))
+ print_function_args(s, entry->args, func);
+ else
+ trace_seq_puts(s, "()");
+
+ trace_seq_puts(s, " {");
+
if (flags & __TRACE_GRAPH_PRINT_RETADDR &&
entry->ent.type == TRACE_GRAPH_RETADDR_ENT)
print_graph_retaddr(s, (struct fgraph_retaddr_ent_entry *)entry,
@@ -1107,21 +1197,38 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
struct trace_iterator *iter, u32 flags)
{
struct fgraph_data *data = iter->private;
- struct ftrace_graph_ent *call = &field->graph_ent;
+ struct ftrace_graph_ent *call;
struct ftrace_graph_ret_entry *leaf_ret;
static enum print_line_t ret;
int cpu = iter->cpu;
+ /*
+ * print_graph_entry() may consume the current event,
+ * thus @field may become invalid, so we need to save it.
+ * sizeof(struct ftrace_graph_ent_entry) is very small,
+ * so it can be safely saved on the stack.
+ */
+ struct ftrace_graph_ent_entry *entry;
+ u8 save_buf[sizeof(*entry) + FTRACE_REGS_MAX_ARGS * sizeof(long)];
+
+ /* The ent_size is expected to be as big as the entry */
+ if (iter->ent_size > sizeof(save_buf))
+ iter->ent_size = sizeof(save_buf);
+
+ entry = (void *)save_buf;
+ memcpy(entry, field, iter->ent_size);
+
+ call = &entry->graph_ent;
if (check_irq_entry(iter, flags, call->func, call->depth))
return TRACE_TYPE_HANDLED;
print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags);
- leaf_ret = get_return_for_leaf(iter, field);
+ leaf_ret = get_return_for_leaf(iter, entry);
if (leaf_ret)
- ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
+ ret = print_graph_entry_leaf(iter, entry, leaf_ret, s, flags);
else
- ret = print_graph_entry_nested(iter, field, s, cpu, flags);
+ ret = print_graph_entry_nested(iter, entry, s, cpu, flags);
if (data) {
/*
@@ -1195,7 +1302,8 @@ print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s,
* funcgraph-retval option is enabled.
*/
if (flags & __TRACE_GRAPH_PRINT_RETVAL) {
- print_graph_retval(s, NULL, trace, (void *)func, flags, tr->trace_flags);
+ print_graph_retval(s, NULL, trace, (void *)func, flags,
+ tr->trace_flags, 0);
} else {
/*
* If the return function does not have a matching entry,
@@ -1323,16 +1431,8 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
switch (entry->type) {
case TRACE_GRAPH_ENT: {
- /*
- * print_graph_entry() may consume the current event,
- * thus @field may become invalid, so we need to save it.
- * sizeof(struct ftrace_graph_ent_entry) is very small,
- * it can be safely saved at the stack.
- */
- struct ftrace_graph_ent_entry saved;
trace_assign_type(field, entry);
- saved = *field;
- return print_graph_entry(&saved, s, iter, flags);
+ return print_graph_entry(field, s, iter, flags);
}
#ifdef CONFIG_FUNCTION_GRAPH_RETADDR
case TRACE_GRAPH_RETADDR_ENT: {
@@ -1511,6 +1611,7 @@ void graph_trace_close(struct trace_iterator *iter)
if (data) {
free_percpu(data->cpu_data);
kfree(data);
+ iter->private = NULL;
}
}
@@ -1526,6 +1627,9 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
if (bit == TRACE_GRAPH_GRAPH_TIME)
ftrace_graph_graph_time_control(set);
+ if (bit == TRACE_GRAPH_ARGS)
+ return ftrace_graph_trace_args(tr, set);
+
return 0;
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7294ad676379..40c39e946940 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -150,7 +150,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_flags(flags);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
atomic_dec(&data->disabled);
}
@@ -250,8 +250,6 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
- else
- iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
@@ -295,11 +293,17 @@ __trace_function(struct trace_array *tr,
if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
}
#else
-#define __trace_function trace_function
+static inline void
+__trace_function(struct trace_array *tr,
+ unsigned long ip, unsigned long parent_ip,
+ unsigned int trace_ctx)
+{
+ return trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+}
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index d8d5f18a141a..2703b96d8990 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -124,9 +124,8 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
if (!p)
return true;
*p = '\0';
- rcu_read_lock_sched();
- ret = !!find_module(tk->symbol);
- rcu_read_unlock_sched();
+ scoped_guard(rcu)
+ ret = !!find_module(tk->symbol);
*p = ':';
return ret;
@@ -796,12 +795,10 @@ static struct module *try_module_get_by_name(const char *name)
{
struct module *mod;
- rcu_read_lock_sched();
+ guard(rcu)();
mod = find_module(name);
if (mod && !try_module_get(mod))
mod = NULL;
- rcu_read_unlock_sched();
-
return mod;
}
#else
@@ -1007,8 +1004,11 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
argc = new_argc;
argv = new_argv;
}
- if (argc > MAX_TRACE_ARGS)
+ if (argc > MAX_TRACE_ARGS) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
return -E2BIG;
+ }
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index f3a2722ee4c0..e732c9e37e14 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -316,33 +316,6 @@ static inline void osn_var_reset_all(void)
bool trace_osnoise_callback_enabled;
/*
- * osnoise sample structure definition. Used to store the statistics of a
- * sample run.
- */
-struct osnoise_sample {
- u64 runtime; /* runtime */
- u64 noise; /* noise */
- u64 max_sample; /* max single noise sample */
- int hw_count; /* # HW (incl. hypervisor) interference */
- int nmi_count; /* # NMIs during this sample */
- int irq_count; /* # IRQs during this sample */
- int softirq_count; /* # softirqs during this sample */
- int thread_count; /* # threads during this sample */
-};
-
-#ifdef CONFIG_TIMERLAT_TRACER
-/*
- * timerlat sample structure definition. Used to store the statistics of
- * a sample run.
- */
-struct timerlat_sample {
- u64 timer_latency; /* timer_latency */
- unsigned int seqnum; /* unique sequence */
- int context; /* timer context */
-};
-#endif
-
-/*
* Tracer data.
*/
static struct osnoise_data {
@@ -497,7 +470,7 @@ static void print_osnoise_headers(struct seq_file *s)
* Record an osnoise_sample into the tracer buffer.
*/
static void
-__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
+__record_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
{
struct ring_buffer_event *event;
struct osnoise_entry *entry;
@@ -520,17 +493,19 @@ __trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffe
}
/*
- * Record an osnoise_sample on all osnoise instances.
+ * Record an osnoise_sample on all osnoise instances and fire trace event.
*/
-static void trace_osnoise_sample(struct osnoise_sample *sample)
+static void record_osnoise_sample(struct osnoise_sample *sample)
{
struct osnoise_instance *inst;
struct trace_buffer *buffer;
+ trace_osnoise_sample(sample);
+
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
buffer = inst->tr->array_buffer.buffer;
- __trace_osnoise_sample(sample, buffer);
+ __record_osnoise_sample(sample, buffer);
}
rcu_read_unlock();
}
@@ -574,7 +549,7 @@ static void print_timerlat_headers(struct seq_file *s)
#endif /* CONFIG_PREEMPT_RT */
static void
-__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
+__record_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
{
struct ring_buffer_event *event;
struct timerlat_entry *entry;
@@ -594,15 +569,17 @@ __trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buf
/*
* Record an timerlat_sample into the tracer buffer.
*/
-static void trace_timerlat_sample(struct timerlat_sample *sample)
+static void record_timerlat_sample(struct timerlat_sample *sample)
{
struct osnoise_instance *inst;
struct trace_buffer *buffer;
+ trace_timerlat_sample(sample);
+
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
buffer = inst->tr->array_buffer.buffer;
- __trace_timerlat_sample(sample, buffer);
+ __record_timerlat_sample(sample, buffer);
}
rcu_read_unlock();
}
@@ -1542,27 +1519,25 @@ static int run_osnoise(void)
/*
* In some cases, notably when running on a nohz_full CPU with
- * a stopped tick PREEMPT_RCU has no way to account for QSs.
- * This will eventually cause unwarranted noise as PREEMPT_RCU
- * will force preemption as the means of ending the current
- * grace period. We avoid this problem by calling
- * rcu_momentary_eqs(), which performs a zero duration
- * EQS allowing PREEMPT_RCU to end the current grace period.
- * This call shouldn't be wrapped inside an RCU critical
- * section.
+ * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to
+ * account for QSs. This will eventually cause unwarranted
+ * noise as RCU forces preemption as the means of ending the
+ * current grace period. We avoid this by calling
+ * rcu_momentary_eqs(), which performs a zero duration EQS
+ * allowing RCU to end the current grace period. This call
+ * shouldn't be wrapped inside an RCU critical section.
*
- * Note that in non PREEMPT_RCU kernels QSs are handled through
- * cond_resched()
+ * Normally QSs for other cases are handled through cond_resched().
+ * For simplicity, however, we call rcu_momentary_eqs() for all
+ * configurations here.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
- if (!disable_irq)
- local_irq_disable();
+ if (!disable_irq)
+ local_irq_disable();
- rcu_momentary_eqs();
+ rcu_momentary_eqs();
- if (!disable_irq)
- local_irq_enable();
- }
+ if (!disable_irq)
+ local_irq_enable();
/*
* For the non-preemptive kernel config: let threads runs, if
@@ -1608,7 +1583,7 @@ static int run_osnoise(void)
/* Save interference stats info */
diff_osn_sample_stats(osn_var, &s);
- trace_osnoise_sample(&s);
+ record_osnoise_sample(&s);
notify_new_max_latency(max_noise);
@@ -1803,7 +1778,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
s.timer_latency = diff;
s.context = IRQ_CONTEXT;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
if (osnoise_data.stop_tracing) {
if (time_to_us(diff) >= osnoise_data.stop_tracing) {
@@ -1901,8 +1876,7 @@ static int timerlat_main(void *data)
tlat->count = 0;
tlat->tracing_thread = false;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
+ hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
tlat->kthread = current;
osn_var->pid = current->pid;
/*
@@ -1923,7 +1897,7 @@ static int timerlat_main(void *data)
s.timer_latency = diff;
s.context = THREAD_CONTEXT;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
notify_new_max_latency(diff);
@@ -2032,7 +2006,6 @@ static int start_kthread(unsigned int cpu)
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
- stop_per_cpu_kthreads();
return -ENOMEM;
}
@@ -2456,8 +2429,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file)
tlat = this_cpu_tmr_var();
tlat->count = 0;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
+ hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
migrate_enable();
return 0;
@@ -2529,7 +2501,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
s.timer_latency = diff;
s.context = THREAD_URET;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
notify_new_max_latency(diff);
@@ -2564,7 +2536,7 @@ timerlat_fd_read(struct file *file, char __user *ubuf, size_t count,
s.timer_latency = diff;
s.context = THREAD_CONTEXT;
- trace_timerlat_sample(&s);
+ record_timerlat_sample(&s);
if (osnoise_data.stop_tracing_total) {
if (time_to_us(diff) >= osnoise_data.stop_tracing_total) {
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 03d56f711ad1..fee40ffbd490 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -5,6 +5,7 @@
* Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
*
*/
+#include "trace.h"
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/ftrace.h>
@@ -12,15 +13,19 @@
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/idr.h>
+#include <linux/btf.h>
+#include <linux/bpf.h>
+#include <linux/hashtable.h>
#include "trace_output.h"
+#include "trace_btf.h"
-/* must be a power of 2 */
-#define EVENT_HASHSIZE 128
+/* 2^7 = 128 */
+#define EVENT_HASH_BITS 7
DECLARE_RWSEM(trace_event_sem);
-static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+static DEFINE_HASHTABLE(event_hash, EVENT_HASH_BITS);
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
@@ -684,6 +689,88 @@ int trace_print_lat_context(struct trace_iterator *iter)
return !trace_seq_has_overflowed(s);
}
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+void print_function_args(struct trace_seq *s, unsigned long *args,
+ unsigned long func)
+{
+ const struct btf_param *param;
+ const struct btf_type *t;
+ const char *param_name;
+ char name[KSYM_NAME_LEN];
+ unsigned long arg;
+ struct btf *btf;
+ s32 tid, nr = 0;
+ int a, p, x;
+
+ trace_seq_printf(s, "(");
+
+ if (!args)
+ goto out;
+ if (lookup_symbol_name(func, name))
+ goto out;
+
+ /* TODO: Pass module name here too */
+ t = btf_find_func_proto(name, &btf);
+ if (IS_ERR_OR_NULL(t))
+ goto out;
+
+ param = btf_get_func_param(t, &nr);
+ if (!param)
+ goto out_put;
+
+ for (a = 0, p = 0; p < nr; a++, p++) {
+ if (p)
+ trace_seq_puts(s, ", ");
+
+ /* This only prints what the arch allows (6 args by default) */
+ if (a == FTRACE_REGS_MAX_ARGS) {
+ trace_seq_puts(s, "...");
+ break;
+ }
+
+ arg = args[a];
+
+ param_name = btf_name_by_offset(btf, param[p].name_off);
+ if (param_name)
+ trace_seq_printf(s, "%s=", param_name);
+ t = btf_type_skip_modifiers(btf, param[p].type, &tid);
+
+ switch (t ? BTF_INFO_KIND(t->info) : BTF_KIND_UNKN) {
+ case BTF_KIND_UNKN:
+ trace_seq_putc(s, '?');
+ /* Still print unknown type values */
+ fallthrough;
+ case BTF_KIND_PTR:
+ trace_seq_printf(s, "0x%lx", arg);
+ break;
+ case BTF_KIND_INT:
+ trace_seq_printf(s, "%ld", arg);
+ break;
+ case BTF_KIND_ENUM:
+ trace_seq_printf(s, "%ld", arg);
+ break;
+ default:
+ /* This does not handle complex arguments */
+ trace_seq_printf(s, "(%s)[0x%lx", btf_type_str(t), arg);
+ for (x = sizeof(long); x < t->size; x += sizeof(long)) {
+ trace_seq_putc(s, ':');
+ if (++a == FTRACE_REGS_MAX_ARGS) {
+ trace_seq_puts(s, "...]");
+ goto out_put;
+ }
+ trace_seq_printf(s, "0x%lx", args[a]);
+ }
+ trace_seq_putc(s, ']');
+ break;
+ }
+ }
+out_put:
+ btf_put(btf);
+out:
+ trace_seq_printf(s, ")");
+}
+#endif
+
/**
* ftrace_find_event - find a registered event
* @type: the type of event to look for
@@ -694,11 +781,8 @@ int trace_print_lat_context(struct trace_iterator *iter)
struct trace_event *ftrace_find_event(int type)
{
struct trace_event *event;
- unsigned key;
-
- key = type & (EVENT_HASHSIZE - 1);
- hlist_for_each_entry(event, &event_hash[key], node) {
+ hash_for_each_possible(event_hash, event, node, type) {
if (event->type == type)
return event;
}
@@ -753,7 +837,6 @@ void trace_event_read_unlock(void)
*/
int register_trace_event(struct trace_event *event)
{
- unsigned key;
int ret = 0;
down_write(&trace_event_sem);
@@ -786,9 +869,7 @@ int register_trace_event(struct trace_event *event)
if (event->funcs->binary == NULL)
event->funcs->binary = trace_nop_print;
- key = event->type & (EVENT_HASHSIZE - 1);
-
- hlist_add_head(&event->node, &event_hash[key]);
+ hash_add(event_hash, &event->node, event->type);
ret = event->type;
out:
@@ -803,7 +884,7 @@ EXPORT_SYMBOL_GPL(register_trace_event);
*/
int __unregister_trace_event(struct trace_event *event)
{
- hlist_del(&event->node);
+ hash_del(&event->node);
free_trace_event_type(event->type);
return 0;
}
@@ -1005,12 +1086,15 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
}
static void print_fn_trace(struct trace_seq *s, unsigned long ip,
- unsigned long parent_ip, long delta, int flags)
+ unsigned long parent_ip, long delta,
+ unsigned long *args, int flags)
{
ip += delta;
parent_ip += delta;
seq_print_ip_sym(s, ip, flags);
+ if (args)
+ print_function_args(s, args, ip);
if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
trace_seq_puts(s, " <-");
@@ -1024,10 +1108,19 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
+ unsigned long *args;
+ int args_size;
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
+ args_size = iter->ent_size - offsetof(struct ftrace_entry, args);
+ if (args_size >= FTRACE_REGS_MAX_ARGS * sizeof(long))
+ args = field->args;
+ else
+ args = NULL;
+
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta,
+ args, flags);
trace_seq_putc(s, '\n');
return trace_handle_return(s);
@@ -1248,7 +1341,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
unsigned long *p;
unsigned long *end;
- long delta = iter->tr->text_delta;
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
@@ -1265,7 +1357,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_seq_puts(s, "[FTRACE TRAMPOLINE]\n");
continue;
}
- seq_print_ip_sym(s, (*p) + delta, flags);
+ seq_print_ip_sym(s, trace_adjust_address(iter->tr, *p), flags);
trace_seq_putc(s, '\n');
}
@@ -1700,7 +1792,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags,
trace_assign_type(field, iter->ent);
- print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
+ print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, NULL, flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
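The event-hash hunks above move trace_output.c from an open-coded hlist array with manual key masking to the generic <linux/hashtable.h> API. A minimal sketch of that API, keyed the same way by the event type; the struct and function names are illustrative, not the tracer's own:

#include <linux/hashtable.h>

struct example_event {
	int type;
	struct hlist_node node;
};

/* 2^7 = 128 buckets, matching EVENT_HASH_BITS above. */
static DEFINE_HASHTABLE(example_hash, 7);

static void example_register(struct example_event *ev)
{
	/* hash_add() hashes the key itself; no manual "& (size - 1)". */
	hash_add(example_hash, &ev->node, ev->type);
}

static struct example_event *example_find(int type)
{
	struct example_event *ev;

	/* Walks only the bucket that 'type' hashes to. */
	hash_for_each_possible(example_hash, ev, node, type) {
		if (ev->type == type)
			return ev;
	}
	return NULL;
}

static void example_unregister(struct example_event *ev)
{
	hash_del(&ev->node);
}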
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index dca40f1f1da4..2e305364f2a9 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -41,5 +41,14 @@ extern struct rw_semaphore trace_event_sem;
#define SEQ_PUT_HEX_FIELD(s, x) \
trace_seq_putmem_hex(s, &(x), sizeof(x))
+#ifdef CONFIG_FUNCTION_TRACE_ARGS
+void print_function_args(struct trace_seq *s, unsigned long *args,
+ unsigned long func);
+#else
+static inline void print_function_args(struct trace_seq *s, unsigned long *args,
+ unsigned long func) {
+ trace_seq_puts(s, "()");
+}
+#endif
#endif
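With CONFIG_FUNCTION_TRACE_ARGS enabled, print_function_args() decorates each function-trace line with BTF-resolved argument names and values; with it disabled, the stub above emits a bare "()". A hedged illustration of the resulting output (addresses and exact formatting are approximate):

/*
 *  without args:  vfs_read <-ksys_read
 *  with args:     vfs_read(file=0xffff888103a1c500, buf=0x7ffc91a3e010,
 *                          count=832, pos=0xffffc900001e7ef0) <-ksys_read
 *
 * Pointers are printed as hex, integers and enums as decimal, and
 * anything past FTRACE_REGS_MAX_ARGS is elided with "...".
 */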
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8f58ee1e8858..2eeecb6c95ee 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -770,6 +770,10 @@ static int check_prepare_btf_string_fetch(char *typename,
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+/*
+ * Add the entry code to store the 'argnum'th parameter and return the offset
+ * in the entry data buffer where the data will be stored.
+ */
static int __store_entry_arg(struct trace_probe *tp, int argnum)
{
struct probe_entry_arg *earg = tp->entry_arg;
@@ -793,6 +797,20 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum)
tp->entry_arg = earg;
}
+ /*
+ * The entry code array repeats the pair
+ * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)]
+ * and the remaining entries are filled with [FETCH_OP_END].
+ *
+ * To avoid redundant parameter fetching, scan the entry code array
+ * for a FETCH_OP_ARG that already fetches the 'argnum' parameter,
+ * updating 'offset' along the way so it tracks the last used offset.
+ * If FETCH_OP_END is reached without a matching FETCH_OP_ARG entry,
+ * append a new FETCH_OP_ARG / FETCH_OP_ST_EDATA pair and return the
+ * data offset so that the caller can locate the data in the entry
+ * data buffer.
+ */
offset = 0;
for (i = 0; i < earg->size - 1; i++) {
switch (earg->code[i].op) {
@@ -826,6 +844,16 @@ int traceprobe_get_entry_data_size(struct trace_probe *tp)
if (!earg)
return 0;
+ /*
+ * The earg->code[] array holds the operation sequence that runs in
+ * the entry handler.
+ * The sequence is terminated by FETCH_OP_END, and each value is
+ * stored into the entry data buffer by FETCH_OP_ST_EDATA, which
+ * writes it at data buffer + offset. Every stored value is
+ * "unsigned long" sized and the offset grows with each store, so
+ * the last FETCH_OP_ST_EDATA in the code array determines the total
+ * size we need.
+ */
for (i = 0; i < earg->size; i++) {
switch (earg->code[i].op) {
case FETCH_OP_END:
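The comment above describes how traceprobe_get_entry_data_size() walks the entry code: stop at FETCH_OP_END and track the offset of the last FETCH_OP_ST_EDATA, since every stored value is one unsigned long. A standalone sketch of that scan, assuming a simplified instruction layout with an 'op' and an 'offset' field (the real struct fetch_insn differs in detail):

enum { EX_OP_END, EX_OP_ARG, EX_OP_ST_EDATA };

struct example_insn {
	int op;			/* one of the EX_OP_* codes above */
	unsigned int offset;	/* data offset, used by EX_OP_ST_EDATA */
};

static unsigned int example_entry_data_size(const struct example_insn *code,
					    int nr)
{
	unsigned int size = 0;
	int i;

	for (i = 0; i < nr; i++) {
		switch (code[i].op) {
		case EX_OP_END:
			return size;
		case EX_OP_ST_EDATA:
			/* the last store decides the total buffer size */
			size = code[i].offset + sizeof(unsigned long);
			break;
		default:
			break;
		}
	}
	return size;
}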
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 96792bc4b092..854e5668f5ee 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -545,6 +545,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(BAD_BTF_TID, "Failed to get BTF type info."),\
C(BAD_TYPE4STR, "This type does not fit for string."),\
C(NEED_STRING_TYPE, "$comm and immediate-string only accepts string type"),\
+ C(TOO_MANY_ARGS, "Too many arguments are specified"), \
C(TOO_MANY_EARGS, "Too many entry arguments specified"),
#undef C
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index af30586f1aea..a0db3404f7f7 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -188,8 +188,6 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
- else
- iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)
@@ -242,7 +240,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
return;
local_irq_save(flags);
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
local_irq_restore(flags);
atomic_dec(&data->disabled);
@@ -327,7 +325,7 @@ __trace_function(struct trace_array *tr,
if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, trace_ctx);
else
- trace_function(tr, ip, parent_ip, trace_ctx);
+ trace_function(tr, ip, parent_ip, trace_ctx, NULL);
}
static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
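These wakeup-tracer hunks adapt to trace_function() growing a fifth parameter carrying the register state that function-argument recording needs. A hedged sketch of the updated prototype (the authoritative declaration lives in kernel/trace/trace.h):

/*
 * void trace_function(struct trace_array *tr, unsigned long ip,
 *                     unsigned long parent_ip, unsigned int trace_ctx,
 *                     struct ftrace_regs *fregs);
 *
 * Passing a NULL fregs, as __trace_function() does above, records the
 * event without any function arguments.
 */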
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index ccc762fbb69c..3386439ec9f6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -562,8 +562,14 @@ static int __trace_uprobe_create(int argc, const char **argv)
if (argc < 2)
return -ECANCELED;
- if (argc - 2 > MAX_TRACE_ARGS)
+
+ trace_probe_log_init("trace_uprobe", argc, argv);
+
+ if (argc - 2 > MAX_TRACE_ARGS) {
+ trace_probe_log_set_index(2);
+ trace_probe_log_err(0, TOO_MANY_ARGS);
return -E2BIG;
+ }
if (argv[0][1] == ':')
event = &argv[0][2];
@@ -582,7 +588,6 @@ static int __trace_uprobe_create(int argc, const char **argv)
return -ECANCELED;
}
- trace_probe_log_init("trace_uprobe", argc, argv);
trace_probe_log_set_index(1); /* filename is the 2nd argument */
*arg++ = '\0';
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1848ce7e2976..62719d2941c9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -127,7 +127,7 @@ static void debug_print_probes(struct tracepoint_func *funcs)
return;
for (i = 0; funcs[i].func; i++)
- printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func);
+ printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func);
}
static struct tracepoint_func *
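The debug_print_probes() hunk above switches from %p, which prints a hashed pointer, to the symbolic %pSb printk format. A hedged one-line illustration:

/*
 * printk(KERN_DEBUG "Probe %d : %pSb\n", i, funcs[i].func);
 *
 * prints the probe as symbol+offset (plus the module build ID when the
 * probe lives in a module) instead of an unreadable hashed address.
 */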
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa0b2e47f2f2..682f40d5632d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(__put_user_ns);
struct idmap_key {
bool map_up; /* true -> id from kid; false -> kid from id */
u32 id; /* id to find */
- u32 count; /* == 0 unless used with map_id_range_down() */
+ u32 count;
};
/*
@@ -343,16 +343,19 @@ u32 map_id_down(struct uid_gid_map *map, u32 id)
* UID_GID_MAP_MAX_BASE_EXTENTS.
*/
static struct uid_gid_extent *
-map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
+map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
unsigned idx;
- u32 first, last;
+ u32 first, last, id2;
+
+ id2 = id + count - 1;
/* Find the matching extent */
for (idx = 0; idx < extents; idx++) {
first = map->extent[idx].lower_first;
last = first + map->extent[idx].count - 1;
- if (id >= first && id <= last)
+ if (id >= first && id <= last &&
+ (id2 >= first && id2 <= last))
return &map->extent[idx];
}
return NULL;
@@ -363,28 +366,28 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
static struct uid_gid_extent *
-map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
+map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
struct idmap_key key;
key.map_up = true;
- key.count = 1;
+ key.count = count;
key.id = id;
return bsearch(&key, map->reverse, extents,
sizeof(struct uid_gid_extent), cmp_map_id);
}
-u32 map_id_up(struct uid_gid_map *map, u32 id)
+u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count)
{
struct uid_gid_extent *extent;
unsigned extents = map->nr_extents;
smp_rmb();
if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
- extent = map_id_up_base(extents, map, id);
+ extent = map_id_range_up_base(extents, map, id, count);
else
- extent = map_id_up_max(extents, map, id);
+ extent = map_id_range_up_max(extents, map, id, count);
/* Map the id or note failure */
if (extent)
@@ -395,6 +398,11 @@ u32 map_id_up(struct uid_gid_map *map, u32 id)
return id;
}
+u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+ return map_id_range_up(map, id, 1);
+}
+
/**
* make_kuid - Map a user-namespace uid pair into a kuid.
* @ns: User namespace that the uid is in
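The range variant added above only succeeds when the whole contiguous block [id, id + count - 1] falls inside a single extent. A worked example under an assumed one-extent map (lower_first = 100000, first = 0, count = 1000, i.e. kernel ids 100000..100999 map to namespace ids 0..999):

/*
 * map_id_range_up(map, 100010, 5) -> 10
 *	100010..100014 all lie inside the extent, so the range maps and
 *	the translated start id (first + (id - lower_first)) is returned.
 *
 * map_id_range_up(map, 100998, 5) -> no mapping
 *	100998 is inside the extent but 101002 is not, so the added
 *	(id2 >= first && id2 <= last) check rejects the extent and the
 *	"note failure" path runs instead.
 *
 * map_id_up(map, id) is now just the count == 1 special case.
 */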
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 5267adeaa403..7e45559521af 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -101,12 +101,11 @@ static bool post_one_notification(struct watch_queue *wqueue,
struct pipe_inode_info *pipe = wqueue->pipe;
struct pipe_buffer *buf;
struct page *page;
- unsigned int head, tail, mask, note, offset, len;
+ unsigned int head, tail, note, offset, len;
bool done = false;
spin_lock_irq(&pipe->rd_wait.lock);
- mask = pipe->ring_size - 1;
head = pipe->head;
tail = pipe->tail;
if (pipe_full(head, tail, pipe->ring_size))
@@ -124,7 +123,7 @@ static bool post_one_notification(struct watch_queue *wqueue,
memcpy(p + offset, n, len);
kunmap_atomic(p);
- buf = &pipe->bufs[head & mask];
+ buf = pipe_buf(pipe, head);
buf->page = page;
buf->private = (unsigned long)wqueue;
buf->ops = &watch_queue_pipe_buf_ops;
@@ -147,7 +146,7 @@ out:
return done;
lost:
- buf = &pipe->bufs[(head - 1) & mask];
+ buf = pipe_buf(pipe, head - 1);
buf->flags |= PIPE_BUF_FLAG_LOSS;
goto out;
}
@@ -269,6 +268,15 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
if (ret < 0)
goto error;
+ /*
+ * pipe_resize_ring() does not update nr_accounted for watch_queue
+ * pipes, because the above vastly overprovisions. Set nr_accounted
+ * and max_usage on this pipe to the number that was actually charged
+ * to the user above via account_pipe_buffers().
+ */
+ pipe->max_usage = nr_pages;
+ pipe->nr_accounted = nr_pages;
+
ret = -ENOMEM;
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
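The post_one_notification() hunks above drop the open-coded ring masking in favour of the pipe_buf() accessor. A hedged sketch of what that helper amounts to (the real definition lives in include/linux/pipe_fs_i.h; this is the rough equivalent, assuming the ring size stays a power of two):

static inline struct pipe_buffer *example_pipe_buf(const struct pipe_inode_info *pipe,
						   unsigned int slot)
{
	/* same as the old "&pipe->bufs[slot & (pipe->ring_size - 1)]" */
	return &pipe->bufs[slot & (pipe->ring_size - 1)];
}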
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b2da7de39d06..9fa2af9dbf2c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -347,8 +347,6 @@ static int __init watchdog_thresh_setup(char *str)
}
__setup("watchdog_thresh=", watchdog_thresh_setup);
-static void __lockup_detector_cleanup(void);
-
#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
enum stats_per_group {
STATS_SYSTEM,
@@ -797,8 +795,7 @@ static void watchdog_enable(unsigned int cpu)
* Start the timer first to prevent the hardlockup watchdog triggering
* before the timer has a chance to fire.
*/
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- hrtimer->function = watchdog_timer_fn;
+ hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED_HARD);
@@ -886,11 +883,6 @@ static void __lockup_detector_reconfigure(void)
watchdog_hardlockup_start();
cpus_read_unlock();
- /*
- * Must be called outside the cpus locked section to prevent
- * recursive locking in the perf code.
- */
- __lockup_detector_cleanup();
}
void lockup_detector_reconfigure(void)
@@ -940,24 +932,6 @@ static inline void lockup_detector_setup(void)
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
-static void __lockup_detector_cleanup(void)
-{
- lockdep_assert_held(&watchdog_mutex);
- hardlockup_detector_perf_cleanup();
-}
-
-/**
- * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
- *
- * Caller must not hold the cpu hotplug rwsem.
- */
-void lockup_detector_cleanup(void)
-{
- mutex_lock(&watchdog_mutex);
- __lockup_detector_cleanup();
- mutex_unlock(&watchdog_mutex);
-}
-
/**
* lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
*
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 59c1d86a73a2..a78ff092d636 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -21,8 +21,6 @@
#include <linux/perf_event.h>
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-static DEFINE_PER_CPU(struct perf_event *, dead_event);
-static struct cpumask dead_events_mask;
static atomic_t watchdog_cpus = ATOMIC_INIT(0);
@@ -146,6 +144,7 @@ static int hardlockup_detector_event_create(void)
PTR_ERR(evt));
return PTR_ERR(evt);
}
+ WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
this_cpu_write(watchdog_ev, evt);
return 0;
}
@@ -181,37 +180,13 @@ void watchdog_hardlockup_disable(unsigned int cpu)
if (event) {
perf_event_disable(event);
+ perf_event_release_kernel(event);
this_cpu_write(watchdog_ev, NULL);
- this_cpu_write(dead_event, event);
- cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
atomic_dec(&watchdog_cpus);
}
}
/**
- * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
- *
- * Called from lockup_detector_cleanup(). Serialized by the caller.
- */
-void hardlockup_detector_perf_cleanup(void)
-{
- int cpu;
-
- for_each_cpu(cpu, &dead_events_mask) {
- struct perf_event *event = per_cpu(dead_event, cpu);
-
- /*
- * Required because for_each_cpu() reports unconditionally
- * CPU0 as set on UP kernels. Sigh.
- */
- if (event)
- perf_event_release_kernel(event);
- per_cpu(dead_event, cpu) = NULL;
- }
- cpumask_clear(&dead_events_mask);
-}
-
-/**
* hardlockup_detector_perf_stop - Globally stop watchdog events
*
* Special interface for x86 to handle the perf HT bug.